Diffstat (limited to 'tensorflow')
-rw-r--r--  tensorflow/BUILD | 24
-rw-r--r--  tensorflow/c/c_api.cc | 33
-rw-r--r--  tensorflow/c/c_api.h | 13
-rw-r--r--  tensorflow/c/c_api_test.cc | 76
-rw-r--r--  tensorflow/c/eager/c_api.cc | 4
-rw-r--r--  tensorflow/c/eager/c_api.h | 3
-rw-r--r--  tensorflow/c/eager/c_api_test.cc | 54
-rw-r--r--  tensorflow/cc/gradients/math_grad_test.cc | 6
-rw-r--r--  tensorflow/cc/gradients/nn_grad.cc | 85
-rw-r--r--  tensorflow/cc/gradients/nn_grad_test.cc | 22
-rw-r--r--  tensorflow/cc/saved_model/loader.cc | 95
-rw-r--r--  tensorflow/compiler/aot/BUILD | 1
-rw-r--r--  tensorflow/compiler/aot/compile.cc | 1
-rw-r--r--  tensorflow/compiler/jit/BUILD | 1
-rw-r--r--  tensorflow/compiler/jit/mark_for_compilation_pass.cc | 24
-rw-r--r--  tensorflow/compiler/jit/mark_for_compilation_pass_test.cc | 33
-rw-r--r--  tensorflow/compiler/jit/xla_compilation_cache.cc | 18
-rw-r--r--  tensorflow/compiler/jit/xla_device.h | 7
-rw-r--r--  tensorflow/compiler/jit/xla_device_context.cc | 117
-rw-r--r--  tensorflow/compiler/jit/xla_device_context.h | 5
-rw-r--r--  tensorflow/compiler/jit/xla_fusion_optimizer.cc | 12
-rw-r--r--  tensorflow/compiler/jit/xla_launch_util.cc | 12
-rw-r--r--  tensorflow/compiler/jit/xla_tensor.cc | 4
-rw-r--r--  tensorflow/compiler/tf2xla/BUILD | 12
-rw-r--r--  tensorflow/compiler/tf2xla/dump_graph.cc | 53
-rw-r--r--  tensorflow/compiler/tf2xla/functionalize_control_flow.cc | 37
-rw-r--r--  tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc | 5
-rw-r--r--  tensorflow/compiler/tf2xla/graph_compiler.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/BUILD | 10
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/bias_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/binary_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/bucketize_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/cast_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/categorical_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/concat_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/const_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/conv_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/cross_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/cwise_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/cwise_ops.h | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/diag_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/elu_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/fft_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/fill_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/gather_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/if_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/image_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/l2loss_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/listdiff_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/lrn_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/matmul_op.cc | 14
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/pack_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/pad_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/pooling_ops.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/random_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/reduction_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/reduction_ops.h | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/relu_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/reshape_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/retval_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/reverse_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/scan_ops.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/select_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/shape_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/slice_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/softmax_op.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/sort_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/split_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/tile_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/topk_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/training_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/transpose_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/unary_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/unpack_op.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/variable_ops.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/kernels/while_op.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.cc | 63
-rw-r--r--  tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h | 49
-rw-r--r--  tensorflow/compiler/tf2xla/lib/BUILD | 32
-rw-r--r--  tensorflow/compiler/tf2xla/lib/batch_dot.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/lib/batch_dot.h | 4
-rw-r--r--  tensorflow/compiler/tf2xla/lib/cholesky.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/lib/cholesky.h | 4
-rw-r--r--  tensorflow/compiler/tf2xla/lib/qr.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/lib/qr.h | 2
-rw-r--r--  tensorflow/compiler/tf2xla/lib/random.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/lib/random.h | 2
-rw-r--r--  tensorflow/compiler/tf2xla/lib/scatter.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/lib/scatter.h | 4
-rw-r--r--  tensorflow/compiler/tf2xla/lib/triangular_solve.cc | 831
-rw-r--r--  tensorflow/compiler/tf2xla/lib/triangular_solve.h | 10
-rw-r--r--  tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc | 66
-rw-r--r--  tensorflow/compiler/tf2xla/lib/util.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/lib/util.h | 4
-rw-r--r--  tensorflow/compiler/tf2xla/lib/while_loop.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/lib/while_loop.h | 4
-rw-r--r--  tensorflow/compiler/tf2xla/tf2xla.cc | 1
-rw-r--r--  tensorflow/compiler/tf2xla/tf2xla.h | 2
-rw-r--r--  tensorflow/compiler/tf2xla/tf2xla_test.cc | 1
-rw-r--r--  tensorflow/compiler/tf2xla/xla_compilation_device.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/xla_compilation_device.h | 2
-rw-r--r--  tensorflow/compiler/tf2xla/xla_compiler.cc | 16
-rw-r--r--  tensorflow/compiler/tf2xla/xla_compiler.h | 1
-rw-r--r--  tensorflow/compiler/tf2xla/xla_compiler_test.cc | 13
-rw-r--r--  tensorflow/compiler/tf2xla/xla_context.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/xla_context.h | 4
-rw-r--r--  tensorflow/compiler/tf2xla/xla_gpu_backend.cc | 15
-rw-r--r--  tensorflow/compiler/tf2xla/xla_helpers.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/xla_helpers.h | 2
-rw-r--r--  tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc | 1
-rw-r--r--  tensorflow/compiler/tf2xla/xla_op_kernel.cc | 3
-rw-r--r--  tensorflow/compiler/tf2xla/xla_op_kernel.h | 3
-rw-r--r--  tensorflow/compiler/tf2xla/xla_resource.cc | 2
-rw-r--r--  tensorflow/compiler/tf2xla/xla_resource.h | 2
-rw-r--r--  tensorflow/compiler/xla/BUILD | 2
-rw-r--r--  tensorflow/compiler/xla/client/BUILD | 64
-rw-r--r--  tensorflow/compiler/xla/client/client.cc | 1
-rw-r--r--  tensorflow/compiler/xla/client/client.h | 2
-rw-r--r--  tensorflow/compiler/xla/client/compile_only_client.h | 2
-rw-r--r--  tensorflow/compiler/xla/client/lib/BUILD | 22
-rw-r--r--  tensorflow/compiler/xla/client/lib/arithmetic.cc | 4
-rw-r--r--  tensorflow/compiler/xla/client/lib/arithmetic.h | 4
-rw-r--r--  tensorflow/compiler/xla/client/lib/constants.h | 2
-rw-r--r--  tensorflow/compiler/xla/client/lib/constants_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/client/lib/math.h | 2
-rw-r--r--  tensorflow/compiler/xla/client/lib/math_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/client/lib/numeric.cc | 28
-rw-r--r--  tensorflow/compiler/xla/client/lib/numeric.h | 11
-rw-r--r--  tensorflow/compiler/xla/client/lib/numeric_test.cc | 19
-rw-r--r--  tensorflow/compiler/xla/client/lib/prng.cc | 2
-rw-r--r--  tensorflow/compiler/xla/client/lib/prng.h | 2
-rw-r--r--  tensorflow/compiler/xla/client/lib/testing.cc | 2
-rw-r--r--  tensorflow/compiler/xla/client/lib/testing.h | 2
-rw-r--r--  tensorflow/compiler/xla/client/local_client.cc | 8
-rw-r--r--  tensorflow/compiler/xla/client/local_client.h | 2
-rw-r--r--  tensorflow/compiler/xla/client/xla_builder.cc (renamed from tensorflow/compiler/xla/client/xla_client/xla_builder.cc) | 10
-rw-r--r--  tensorflow/compiler/xla/client/xla_builder.h | 2241
-rw-r--r--  tensorflow/compiler/xla/client/xla_builder_test.cc (renamed from tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc) | 3
-rw-r--r--  tensorflow/compiler/xla/client/xla_client/BUILD | 48
-rw-r--r--  tensorflow/compiler/xla/client/xla_client/xla_builder.h | 2210
-rw-r--r--  tensorflow/compiler/xla/client/xla_computation.cc (renamed from tensorflow/compiler/xla/client/xla_client/xla_computation.cc) | 2
-rw-r--r--  tensorflow/compiler/xla/client/xla_computation.h (renamed from tensorflow/compiler/xla/client/xla_client/xla_computation.h) | 6
-rw-r--r--  tensorflow/compiler/xla/literal.cc | 4
-rw-r--r--  tensorflow/compiler/xla/python/BUILD | 4
-rw-r--r--  tensorflow/compiler/xla/python/local_computation_builder.cc | 7
-rw-r--r--  tensorflow/compiler/xla/python/local_computation_builder.h | 7
-rw-r--r--  tensorflow/compiler/xla/python/local_computation_builder.i | 1
-rw-r--r--  tensorflow/compiler/xla/python/xla_client.py | 12
-rw-r--r--  tensorflow/compiler/xla/python/xla_client_test.py | 28
-rw-r--r--  tensorflow/compiler/xla/reference_util.cc | 2
-rw-r--r--  tensorflow/compiler/xla/rpc/BUILD | 2
-rw-r--r--  tensorflow/compiler/xla/rpc/grpc_client_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/BUILD | 34
-rw-r--r--  tensorflow/compiler/xla/service/algebraic_simplifier.cc | 62
-rw-r--r--  tensorflow/compiler/xla/service/algebraic_simplifier_test.cc | 90
-rw-r--r--  tensorflow/compiler/xla/service/backend.cc | 17
-rw-r--r--  tensorflow/compiler/xla/service/backend.h | 14
-rw-r--r--  tensorflow/compiler/xla/service/bfloat16_propagation.cc | 14
-rw-r--r--  tensorflow/compiler/xla/service/bfloat16_propagation_test.cc | 69
-rw-r--r--  tensorflow/compiler/xla/service/buffer_assignment.cc | 122
-rw-r--r--  tensorflow/compiler/xla/service/buffer_assignment.h | 71
-rw-r--r--  tensorflow/compiler/xla/service/buffer_assignment_test.cc | 196
-rw-r--r--  tensorflow/compiler/xla/service/computation_placer.cc | 17
-rw-r--r--  tensorflow/compiler/xla/service/computation_placer.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/copy_insertion.cc | 77
-rw-r--r--  tensorflow/compiler/xla/service/copy_insertion.h | 22
-rw-r--r--  tensorflow/compiler/xla/service/cpu/BUILD | 6
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_compiler.cc | 16
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_executable.cc | 5
-rw-r--r--  tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc | 303
-rw-r--r--  tensorflow/compiler/xla/service/cpu/dot_op_emitter.h | 18
-rw-r--r--  tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc | 15
-rw-r--r--  tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 857
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_emitter.h | 25
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_function.cc | 71
-rw-r--r--  tensorflow/compiler/xla/service/cpu/ir_function.h | 11
-rw-r--r--  tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc | 88
-rw-r--r--  tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc | 9
-rw-r--r--  tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/cpu/sample_harness.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/cpu/tests/BUILD | 4
-rw-r--r--  tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc | 8
-rw-r--r--  tensorflow/compiler/xla/service/cpu/vector_support_library.cc | 163
-rw-r--r--  tensorflow/compiler/xla/service/cpu/vector_support_library.h | 36
-rw-r--r--  tensorflow/compiler/xla/service/dfs_hlo_visitor.h | 1
-rw-r--r--  tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h | 3
-rw-r--r--  tensorflow/compiler/xla/service/elemental_ir_emitter.cc | 1323
-rw-r--r--  tensorflow/compiler/xla/service/elemental_ir_emitter.h | 25
-rw-r--r--  tensorflow/compiler/xla/service/executable.cc | 13
-rw-r--r--  tensorflow/compiler/xla/service/execution_tracker.cc | 9
-rw-r--r--  tensorflow/compiler/xla/service/execution_tracker.h | 8
-rw-r--r--  tensorflow/compiler/xla/service/gpu/BUILD | 13
-rw-r--r--  tensorflow/compiler/xla/service/gpu/buffer_allocations.cc | 27
-rw-r--r--  tensorflow/compiler/xla/service/gpu/buffer_allocations.h | 6
-rw-r--r--  tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc | 101
-rw-r--r--  tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_constants.cc | 15
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_constants.h | 12
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc | 79
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.cc | 62
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.h | 21
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc | 51
-rw-r--r--  tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc | 11
-rw-r--r--  tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h | 12
-rw-r--r--  tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc | 45
-rw-r--r--  tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h | 9
-rw-r--r--  tensorflow/compiler/xla/service/gpu/instruction_fusion.cc | 71
-rw-r--r--  tensorflow/compiler/xla/service/gpu/instruction_fusion.h | 13
-rw-r--r--  tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc | 30
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc | 5
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emission_utils.h | 6
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emitter.cc | 279
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emitter.h | 5
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc | 16
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 852
-rw-r--r--  tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h | 3
-rw-r--r--  tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc | 15
-rw-r--r--  tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc | 54
-rw-r--r--  tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc | 15
-rw-r--r--  tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc | 60
-rw-r--r--  tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h | 6
-rw-r--r--  tensorflow/compiler/xla/service/gpu/stream_assignment.cc | 70
-rw-r--r--  tensorflow/compiler/xla/service/gpu/stream_executor_util.cc | 8
-rw-r--r--  tensorflow/compiler/xla/service/gpu/stream_executor_util.h | 12
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/BUILD | 223
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc | 54
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc | 50
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h | 42
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc | 59
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc | 119
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc | 59
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc | 147
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc | 177
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc | 141
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc | 68
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc | 185
-rw-r--r--  tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc | 121
-rw-r--r--  tensorflow/compiler/xla/service/gpu/thunk.cc | 59
-rw-r--r--  tensorflow/compiler/xla/service/gpu/thunk.h | 4
-rw-r--r--  tensorflow/compiler/xla/service/gpu/while_thunk.cc | 3
-rw-r--r--  tensorflow/compiler/xla/service/gpu/xfeed_queue.h | 1
-rw-r--r--  tensorflow/compiler/xla/service/hlo.proto | 3
-rw-r--r--  tensorflow/compiler/xla/service/hlo_alias_analysis.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/hlo_cost_analysis.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/hlo_cost_analysis.h | 1
-rw-r--r--  tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/service/hlo_cse_test.cc | 179
-rw-r--r--  tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc | 39
-rw-r--r--  tensorflow/compiler/xla/service/hlo_evaluator_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h | 32
-rw-r--r--  tensorflow/compiler/xla/service/hlo_graph_dumper.cc | 1
-rw-r--r--  tensorflow/compiler/xla/service/hlo_instruction.cc | 21
-rw-r--r--  tensorflow/compiler/xla/service/hlo_instruction.h | 3
-rw-r--r--  tensorflow/compiler/xla/service/hlo_opcode.h | 1
-rw-r--r--  tensorflow/compiler/xla/service/hlo_parser.cc | 19
-rw-r--r--  tensorflow/compiler/xla/service/hlo_parser_test.cc | 22
-rw-r--r--  tensorflow/compiler/xla/service/hlo_rematerialization.cc | 11
-rw-r--r--  tensorflow/compiler/xla/service/hlo_rematerialization.h | 10
-rw-r--r--  tensorflow/compiler/xla/service/hlo_rematerialization_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/hlo_runner.cc | 9
-rw-r--r--  tensorflow/compiler/xla/service/hlo_verifier.cc | 8
-rw-r--r--  tensorflow/compiler/xla/service/hlo_verifier.h | 1
-rw-r--r--  tensorflow/compiler/xla/service/instruction_fusion.cc | 1
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/BUILD | 22
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc | 3
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc | 59
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h | 34
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc | 70
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h | 7
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc | 30
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h | 4
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/ir_array.cc | 55
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/ir_array.h | 22
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc | 45
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h | 56
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc | 18
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc | 71
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h | 21
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc | 201
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/llvm_util.h | 49
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc | 36
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h | 11
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/math_ops.cc | 59
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/math_ops.h (renamed from tensorflow/compiler/xla/service/pool_test.cc) | 32
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/sort_util.cc | 192
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/sort_util.h | 15
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc | 49
-rw-r--r--  tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h | 7
-rw-r--r--  tensorflow/compiler/xla/service/local_service.cc | 1
-rw-r--r--  tensorflow/compiler/xla/service/local_service.h | 2
-rw-r--r--  tensorflow/compiler/xla/service/pool.h | 84
-rw-r--r--  tensorflow/compiler/xla/service/service.cc | 11
-rw-r--r--  tensorflow/compiler/xla/service/service_executable_run_options.h | 7
-rw-r--r--  tensorflow/compiler/xla/service/stream_pool.cc | 56
-rw-r--r--  tensorflow/compiler/xla/service/stream_pool.h | 64
-rw-r--r--  tensorflow/compiler/xla/service/stream_pool_test.cc | 136
-rw-r--r--  tensorflow/compiler/xla/service/transpose_folding_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/shape_util.cc | 4
-rw-r--r--  tensorflow/compiler/xla/shape_util_test.cc | 11
-rw-r--r--  tensorflow/compiler/xla/tests/BUILD | 292
-rw-r--r--  tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/axpy_simple_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/batch_normalization_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/bfloat16_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/binop_scaling_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/bitcast_convert_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/broadcast_simple_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/call_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/check_execution_arity_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/client_library_test_base.cc | 66
-rw-r--r--  tensorflow/compiler/xla/tests/client_library_test_base.h | 10
-rw-r--r--  tensorflow/compiler/xla/tests/client_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/compilation_cache_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/compute_constant_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/concat_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/conditional_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/constants_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/convert_test.cc | 72
-rw-r--r--  tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/convolution_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/convolution_variants_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/copy_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/custom_call_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/deallocation_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/deep_graph_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/dot_operation_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/dynamic_ops_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/execution_profile_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/floor_ceil_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/fmax_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/fusion_test.cc | 42
-rw-r--r--  tensorflow/compiler/xla/tests/gather_operation_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/half_test.cc | 5
-rw-r--r--  tensorflow/compiler/xla/tests/hlo_metadata_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/iota_test.cc | 61
-rw-r--r--  tensorflow/compiler/xla/tests/local_client_allocation_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc | 7
-rw-r--r--  tensorflow/compiler/xla/tests/local_client_execute_test.cc | 6
-rw-r--r--  tensorflow/compiler/xla/tests/local_client_test_base.cc | 15
-rw-r--r--  tensorflow/compiler/xla/tests/local_client_test_base.h | 2
-rw-r--r--  tensorflow/compiler/xla/tests/log_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/map_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/multidimensional_slice_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc | 168
-rw-r--r--  tensorflow/compiler/xla/tests/pad_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/params_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/pred_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/prng_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/query_inferred_shape_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/reduce_precision_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/reduce_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/reduce_window_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/replay_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/reshape_motion_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/reshape_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/reverse_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/scalar_computations_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/select_and_scatter_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/select_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/slice_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/test_utils_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/transfer_manager_test.cc | 3
-rw-r--r--  tensorflow/compiler/xla/tests/transpose_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/tuple_test.cc | 51
-rw-r--r--  tensorflow/compiler/xla/tests/unary_op_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc | 2
-rw-r--r--  tensorflow/compiler/xla/tests/vector_ops_simple_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/while_test.cc | 4
-rw-r--r--  tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc | 8
-rw-r--r--  tensorflow/compiler/xla/tools/BUILD | 5
-rw-r--r--  tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc | 1
-rw-r--r--  tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc | 1
-rw-r--r--  tensorflow/compiler/xla/tools/dumped_computation_to_text.cc | 1
-rw-r--r--  tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc | 1
-rw-r--r--  tensorflow/compiler/xla/tools/replay_computation.cc | 1
-rw-r--r--  tensorflow/contrib/BUILD | 6
-rw-r--r--  tensorflow/contrib/android/cmake/src/main/AndroidManifest.xml | 4
-rw-r--r--  tensorflow/contrib/autograph/__init__.py | 12
-rw-r--r--  tensorflow/contrib/autograph/converters/asserts_test.py | 2
-rw-r--r--  tensorflow/contrib/autograph/converters/directives_test.py | 6
-rw-r--r--  tensorflow/contrib/autograph/converters/error_handlers_test.py | 6
-rw-r--r--  tensorflow/contrib/autograph/converters/lists_test.py | 22
-rw-r--r--  tensorflow/contrib/autograph/converters/side_effect_guards_test.py | 12
-rw-r--r--  tensorflow/contrib/autograph/converters/slices.py | 6
-rw-r--r--  tensorflow/contrib/autograph/converters/slices_test.py | 6
-rw-r--r--  tensorflow/contrib/autograph/core/converter_testing.py | 4
-rw-r--r--  tensorflow/contrib/autograph/core/errors.py | 116
-rw-r--r--  tensorflow/contrib/autograph/core/errors_test.py | 108
-rw-r--r--  tensorflow/contrib/autograph/examples/integration_tests/BUILD | 14
-rw-r--r--  tensorflow/contrib/autograph/examples/integration_tests/keras_test.py | 25
-rw-r--r--  tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py | 41
-rw-r--r--  tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb | 1512
-rw-r--r--  tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb | 26
-rw-r--r--  tensorflow/contrib/autograph/examples/notebooks/graph_vs_ag_vs_eager_sum_speed_test.ipynb | 519
-rw-r--r--  tensorflow/contrib/autograph/impl/api.py | 21
-rw-r--r--  tensorflow/contrib/autograph/impl/conversion.py | 34
-rw-r--r--  tensorflow/contrib/autograph/impl/conversion_test.py | 29
-rw-r--r--  tensorflow/contrib/autograph/lang/special_functions.py | 37
-rw-r--r--  tensorflow/contrib/autograph/lang/special_functions_test.py | 18
-rw-r--r--  tensorflow/contrib/autograph/operators/data_structures.py | 91
-rw-r--r--  tensorflow/contrib/autograph/operators/data_structures_test.py | 41
-rw-r--r--  tensorflow/contrib/autograph/pyct/BUILD | 10
-rw-r--r--  tensorflow/contrib/autograph/pyct/ast_util.py | 87
-rw-r--r--  tensorflow/contrib/autograph/pyct/ast_util_test.py | 62
-rw-r--r--  tensorflow/contrib/autograph/pyct/cfg.py | 6
-rw-r--r--  tensorflow/contrib/autograph/pyct/common_transformers/anf_test.py | 2
-rw-r--r--  tensorflow/contrib/autograph/pyct/compiler.py | 127
-rw-r--r--  tensorflow/contrib/autograph/pyct/compiler_test.py | 2
-rw-r--r--  tensorflow/contrib/autograph/pyct/origin_info.py | 137
-rw-r--r--  tensorflow/contrib/autograph/pyct/origin_info_test.py | 101
-rw-r--r--  tensorflow/contrib/autograph/pyct/parser.py | 1
-rw-r--r--  tensorflow/contrib/autograph/pyct/static_analysis/live_values.py | 16
-rw-r--r--  tensorflow/contrib/autograph/pyct/templates.py | 4
-rw-r--r--  tensorflow/contrib/autograph/pyct/templates_test.py | 13
-rw-r--r--  tensorflow/contrib/autograph/pyct/testing/BUILD | 57
-rw-r--r--  tensorflow/contrib/autograph/pyct/testing/codegen.py | 234
-rw-r--r--  tensorflow/contrib/autograph/pyct/testing/codegen_test.py | 40
-rw-r--r--  tensorflow/contrib/autograph/pyct/transformer.py | 34
-rw-r--r--  tensorflow/contrib/autograph/pyct/transformer_test.py | 82
-rw-r--r--  tensorflow/contrib/autograph/utils/builtins.py | 10
-rw-r--r--  tensorflow/contrib/bigtable/README.md | 338
-rw-r--r--  tensorflow/contrib/bigtable/__init__.py | 6
-rw-r--r--  tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc | 2
-rw-r--r--  tensorflow/contrib/bigtable/kernels/bigtable_lib.cc | 8
-rw-r--r--  tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py | 2
-rw-r--r--  tensorflow/contrib/bigtable/python/ops/bigtable_api.py | 17
-rw-r--r--  tensorflow/contrib/boosted_trees/estimator_batch/BUILD | 1
-rw-r--r--  tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py | 265
-rw-r--r--  tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py | 70
-rw-r--r--  tensorflow/contrib/boosted_trees/estimator_batch/estimator.py | 170
-rw-r--r--  tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py | 57
-rw-r--r--  tensorflow/contrib/boosted_trees/estimator_batch/model.py | 146
-rw-r--r--  tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc | 5
-rw-r--r--  tensorflow/contrib/boosted_trees/kernels/training_ops.cc | 17
-rw-r--r--  tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h | 5
-rw-r--r--  tensorflow/contrib/boosted_trees/ops/training_ops.cc | 3
-rw-r--r--  tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py | 84
-rw-r--r--  tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py | 32
-rw-r--r--  tensorflow/contrib/cloud/README.md | 4
-rw-r--r--  tensorflow/contrib/cloud/__init__.py | 4
-rw-r--r--  tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py | 6
-rw-r--r--  tensorflow/contrib/cmake/CMakeLists.txt | 15
-rw-r--r--  tensorflow/contrib/copy_graph/python/util/copy_elements.py | 4
-rw-r--r--  tensorflow/contrib/crf/__init__.py | 2
-rw-r--r--  tensorflow/contrib/crf/python/kernel_tests/crf_test.py | 62
-rw-r--r--  tensorflow/contrib/crf/python/ops/crf.py | 52
-rw-r--r--  tensorflow/contrib/data/kernels/prefetching_kernels.cc | 464
-rw-r--r--  tensorflow/contrib/data/ops/dataset_ops.cc | 74
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/BUILD | 9
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py | 52
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py | 51
-rw-r--r--  tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py | 151
-rw-r--r--  tensorflow/contrib/data/python/ops/batching.py | 13
-rw-r--r--  tensorflow/contrib/data/python/ops/prefetching_ops.py | 172
-rw-r--r--  tensorflow/contrib/distribute/python/BUILD | 46
-rw-r--r--  tensorflow/contrib/distribute/python/checkpoint_utils_test.py | 8
-rw-r--r--  tensorflow/contrib/distribute/python/cross_tower_ops.py | 39
-rw-r--r--  tensorflow/contrib/distribute/python/mirrored_strategy.py | 297
-rw-r--r--  tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py | 73
-rw-r--r--  tensorflow/contrib/distribute/python/multi_worker_test_base.py | 38
-rw-r--r--  tensorflow/contrib/distribute/python/parameter_server_strategy.py | 353
-rw-r--r--  tensorflow/contrib/distribute/python/parameter_server_strategy_test.py | 455
-rw-r--r--  tensorflow/contrib/distribute/python/values.py | 134
-rw-r--r--  tensorflow/contrib/distribute/python/values_test.py | 25
-rw-r--r--  tensorflow/contrib/distributions/python/ops/quantized_distribution.py | 20
-rw-r--r--  tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb | 42
-rw-r--r--  tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb | 336
-rw-r--r--  tensorflow/contrib/eager/python/examples/l2hmc/README.md | 54
-rw-r--r--  tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py | 143
-rw-r--r--  tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py | 97
-rw-r--r--  tensorflow/contrib/eager/python/examples/l2hmc/main.py | 235
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/BUILD | 57
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/blocks.py | 476
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/blocks_test.py | 15
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/cifar_input.py | 2
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/config.py | 61
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py | 230
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/main.py | 109
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/main_estimator.py | 200
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py | 319
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/resnet_preprocessing.py | 190
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/revnet.py | 184
-rw-r--r--  tensorflow/contrib/eager/python/examples/revnet/revnet_test.py | 39
-rw-r--r--  tensorflow/contrib/eager/python/metrics_impl.py | 10
-rw-r--r--  tensorflow/contrib/eager/python/saver.py | 4
-rw-r--r--  tensorflow/contrib/eager/python/tfe.py | 2
-rw-r--r--  tensorflow/contrib/estimator/BUILD | 41
-rw-r--r--  tensorflow/contrib/estimator/__init__.py | 5
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/boosted_trees.py | 30
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py | 68
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/early_stopping.py | 9
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/early_stopping_test.py | 13
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/hooks.py | 2
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/hooks_test.py | 2
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py | 449
-rw-r--r--  tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py | 369
-rw-r--r--  tensorflow/contrib/framework/python/ops/variables.py | 97
-rw-r--r--  tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc | 3
-rw-r--r--  tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h | 12
-rw-r--r--  tensorflow/contrib/gan/BUILD | 10
-rw-r--r--  tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py | 37
-rw-r--r--  tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py | 19
-rw-r--r--  tensorflow/contrib/gan/python/losses/python/losses_impl.py | 16
-rw-r--r--  tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py | 86
-rw-r--r--  tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py | 114
-rw-r--r--  tensorflow/contrib/gan/python/train.py | 184
-rw-r--r--  tensorflow/contrib/gan/python/train_test.py | 678
-rw-r--r--  tensorflow/contrib/gdr/gdr_memory_manager.cc | 32
-rw-r--r--  tensorflow/contrib/graph_editor/reroute.py | 12
-rw-r--r--  tensorflow/contrib/layers/python/layers/layers.py | 33
-rw-r--r--  tensorflow/contrib/lite/allocation.cc | 4
-rw-r--r--  tensorflow/contrib/lite/arena_planner.cc | 25
-rw-r--r--  tensorflow/contrib/lite/arena_planner.h | 10
-rw-r--r--  tensorflow/contrib/lite/arena_planner_test.cc | 8
-rw-r--r--  tensorflow/contrib/lite/build_def.bzl | 432
-rw-r--r--  tensorflow/contrib/lite/builtin_op_data.h | 9
-rw-r--r--  tensorflow/contrib/lite/builtin_ops.h | 2
-rw-r--r--  tensorflow/contrib/lite/context.h | 13
-rw-r--r--  tensorflow/contrib/lite/delegates/eager/BUILD | 37
-rw-r--r--  tensorflow/contrib/lite/delegates/eager/buffer_map.cc | 4
-rw-r--r--  tensorflow/contrib/lite/delegates/eager/delegate_data.cc | 3
-rw-r--r--  tensorflow/contrib/lite/delegates/eager/kernel.cc | 289
-rw-r--r--  tensorflow/contrib/lite/delegates/eager/kernel.h | 34
-rw-r--r--  tensorflow/contrib/lite/delegates/eager/kernel_test.cc | 351
-rw-r--r--  tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc | 223
-rw-r--r--  tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc | 949
-rw-r--r--  tensorflow/contrib/lite/examples/android/app/README.md | 4
-rw-r--r--  tensorflow/contrib/lite/experimental/c/BUILD | 59
-rw-r--r--  tensorflow/contrib/lite/experimental/c/c_api.cc | 122
-rw-r--r--  tensorflow/contrib/lite/experimental/c/c_api.h | 149
-rw-r--r--  tensorflow/contrib/lite/experimental/c/c_api_test.cc | 84
-rw-r--r--  tensorflow/contrib/lite/experimental/c/exported_symbols.lds | 1
-rw-r--r--  tensorflow/contrib/lite/experimental/c/version_script.lds | 9
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore | 13
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta | 8
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta | 8
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta | 8
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta | 8
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity | 242
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta | 7
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes | bin 0 -> 476 bytes
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta | 7
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta | 8
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs | 70
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta | 11
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta | 8
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta | 8
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs | 145
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta | 11
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset | 17
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset | 6
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset | 29
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset | 7
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset | 21
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset | 61
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset | 295
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset | 91
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset | 8
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset | 37
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset | 641
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt | 1
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset | 191
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset | 43
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset | 9
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset | 34
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md | 24
-rw-r--r--  tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json | 4
-rw-r--r--  tensorflow/contrib/lite/g3doc/README.md | 4
-rw-r--r--  tensorflow/contrib/lite/g3doc/_book.yaml | 58
-rw-r--r--  tensorflow/contrib/lite/g3doc/_index.yaml | 67
-rw-r--r--  tensorflow/contrib/lite/g3doc/_project.yaml | 10
-rw-r--r--  tensorflow/contrib/lite/g3doc/api_docs/python/_toc.yaml | 6
-rw-r--r--  tensorflow/contrib/lite/g3doc/api_docs/python/index.md | 10
-rw-r--r--  tensorflow/contrib/lite/g3doc/apis.md | 4
-rw-r--r--  tensorflow/contrib/lite/g3doc/benchmarks.md | 178
-rw-r--r--  tensorflow/contrib/lite/g3doc/custom_operators.md | 83
-rw-r--r--  tensorflow/contrib/lite/g3doc/demo_android.md (renamed from tensorflow/docs_src/mobile/tflite/demo_android.md) | 3
-rw-r--r--  tensorflow/contrib/lite/g3doc/demo_ios.md (renamed from tensorflow/docs_src/mobile/tflite/demo_ios.md) | 3
-rw-r--r--  tensorflow/contrib/lite/g3doc/devguide.md (renamed from tensorflow/docs_src/mobile/tflite/devguide.md) | 27
-rw-r--r--  tensorflow/contrib/lite/g3doc/ios.md | 3
-rw-r--r--  tensorflow/contrib/lite/g3doc/models.md | 3
-rw-r--r--  tensorflow/contrib/lite/g3doc/ops_versioning.md | 3
-rw-r--r--  tensorflow/contrib/lite/g3doc/overview.md (renamed from tensorflow/docs_src/mobile/tflite/index.md) | 3
-rw-r--r--  tensorflow/contrib/lite/g3doc/performance.md (renamed from tensorflow/docs_src/mobile/tflite/performance.md) | 3
-rw-r--r--  tensorflow/contrib/lite/g3doc/rpi.md | 3
-rw-r--r--  tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md | 28
-rw-r--r--  tensorflow/contrib/lite/g3doc/tfmobile/android_build.md (renamed from tensorflow/docs_src/mobile/android_build.md) | 6
-rw-r--r--  tensorflow/contrib/lite/g3doc/tfmobile/index.md (renamed from tensorflow/docs_src/mobile/mobile_intro.md) | 50
-rw-r--r--  tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md (renamed from tensorflow/docs_src/mobile/ios_build.md) | 3
-rw-r--r--  tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md (renamed from tensorflow/docs_src/mobile/linking_libs.md) | 109
-rw-r--r--  tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md (renamed from tensorflow/docs_src/mobile/optimizing.md) | 11
-rw-r--r--  tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md (renamed from tensorflow/docs_src/mobile/prepare_models.md) | 7
-rw-r--r--  tensorflow/contrib/lite/interpreter.cc | 42
-rw-r--r--  tensorflow/contrib/lite/interpreter.h | 10
-rw-r--r--  tensorflow/contrib/lite/interpreter_test.cc | 17
-rw-r--r--  tensorflow/contrib/lite/java/AndroidManifest.xml | 3
-rw-r--r--  tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc | 18
-rw-r--r--  tensorflow/contrib/lite/kernels/BUILD | 45
-rw-r--r--  tensorflow/contrib/lite/kernels/activations.cc | 54
-rw-r--r--  tensorflow/contrib/lite/kernels/activations_test.cc | 23
-rw-r--r--  tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/concatenation.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/conv.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/depthwise_conv.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/eigen_support.cc | 11
-rw-r--r--  tensorflow/contrib/lite/kernels/embedding_lookup.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/fully_connected.cc | 3
-rw-r--r--  tensorflow/contrib/lite/kernels/hashtable_lookup.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h | 8
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h | 4
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc | 11
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h | 8
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h | 73
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/spectrogram.cc | 10
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/tensor_utils.h | 8
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/types.h | 8
-rw-r--r--  tensorflow/contrib/lite/kernels/logical.cc | 121
-rw-r--r--  tensorflow/contrib/lite/kernels/logical_test.cc | 87
-rw-r--r--  tensorflow/contrib/lite/kernels/lsh_projection.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/lstm.cc | 5
-rw-r--r--  tensorflow/contrib/lite/kernels/one_hot.cc | 199
-rw-r--r--  tensorflow/contrib/lite/kernels/one_hot_test.cc | 182
-rw-r--r--  tensorflow/contrib/lite/kernels/pack.cc | 131
-rw-r--r--  tensorflow/contrib/lite/kernels/pack_test.cc | 120
-rw-r--r--  tensorflow/contrib/lite/kernels/pooling.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/reduce.cc | 4
-rw-r--r--  tensorflow/contrib/lite/kernels/reduce_test.cc | 35
-rw-r--r--  tensorflow/contrib/lite/kernels/register.cc | 6
-rw-r--r--  tensorflow/contrib/lite/kernels/reshape.cc | 80
-rw-r--r--  tensorflow/contrib/lite/kernels/reshape_test.cc | 37
-rw-r--r--  tensorflow/contrib/lite/kernels/resize_bilinear_test.cc | 32
-rw-r--r--  tensorflow/contrib/lite/kernels/space_to_batch_nd.cc | 22
-rw-r--r--  tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc | 142
-rw-r--r--  tensorflow/contrib/lite/kernels/sparse_to_dense.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/svdf.cc | 3
-rw-r--r--  tensorflow/contrib/lite/kernels/transpose_conv.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc | 1
-rw-r--r--  tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc | 1
-rw-r--r--  tensorflow/contrib/lite/model.cc | 20
-rw-r--r--  tensorflow/contrib/lite/model_test.cc | 11
-rw-r--r--  tensorflow/contrib/lite/nnapi_delegate.cc | 11
-rw-r--r--  tensorflow/contrib/lite/profiling/BUILD | 2
-rw-r--r--  tensorflow/contrib/lite/profiling/profile_summarizer.cc | 19
-rw-r--r--  tensorflow/contrib/lite/profiling/time.cc | 18
-rw-r--r--  tensorflow/contrib/lite/python/BUILD | 1
-rw-r--r--  tensorflow/contrib/lite/python/interpreter.py | 6
-rw-r--r--  tensorflow/contrib/lite/python/interpreter_test.py | 2
-rw-r--r--  tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc | 48
-rw-r--r--  tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h | 18
-rw-r--r--  tensorflow/contrib/lite/python/lite.py | 61
-rw-r--r--  tensorflow/contrib/lite/schema/schema.fbs | 11
-rwxr-xr-x  tensorflow/contrib/lite/schema/schema_generated.h | 253
-rw-r--r--  tensorflow/contrib/lite/simple_memory_arena.cc | 3
-rw-r--r--  tensorflow/contrib/lite/testdata/add.bin | bin 0 -> 476 bytes
-rw-r--r--  tensorflow/contrib/lite/testing/BUILD | 14
-rw-r--r--  tensorflow/contrib/lite/testing/generate_examples.py | 230
-rw-r--r--  tensorflow/contrib/lite/testing/generate_testspec.cc | 12
-rw-r--r--  tensorflow/contrib/lite/testing/generate_testspec.h | 2
-rw-r--r--  tensorflow/contrib/lite/testing/generated_examples_zip_test.cc | 3
-rw-r--r--  tensorflow/contrib/lite/testing/join.h | 3
-rw-r--r--  tensorflow/contrib/lite/testing/test_runner.h | 2
-rw-r--r--  tensorflow/contrib/lite/testing/tf_driver.cc | 6
-rw-r--r--  tensorflow/contrib/lite/testing/tflite_diff_flags.h | 2
-rw-r--r--  tensorflow/contrib/lite/testing/util.h | 31
-rw-r--r--  tensorflow/contrib/lite/toco/export_tensorflow.cc | 36
-rw-r--r--  tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md | 2
-rw-r--r--  tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc | 11
-rw-r--r--  tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h | 4
-rw-r--r--  tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc | 25
-rw-r--r--  tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc | 72
-rw-r--r--  tensorflow/contrib/lite/toco/graph_transformations/quantize.cc | 1
-rw-r--r--  tensorflow/contrib/lite/toco/import_tensorflow.cc | 64
-rw-r--r--  tensorflow/contrib/lite/toco/model.h | 117
-rw-r--r--  tensorflow/contrib/lite/toco/model_cmdline_flags.cc | 4
-rw-r--r--  tensorflow/contrib/lite/toco/tflite/export_test.cc | 2
-rw-r--r--  tensorflow/contrib/lite/toco/tflite/operator.cc | 43
-rw-r--r--  tensorflow/contrib/lite/toco/tflite/operator_test.cc | 20
-rw-r--r--  tensorflow/contrib/lite/toco/toco_tooling.cc | 7
-rw-r--r--  tensorflow/contrib/lite/toco/tooling_util.cc | 7
-rw-r--r--  tensorflow/contrib/lite/toco/tooling_util_test.cc | 17
-rw-r--r--  tensorflow/contrib/lite/tools/benchmark/BUILD | 33
-rw-r--r--  tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc | 12
-rw-r--r--  tensorflow/contrib/lite/tools/benchmark/benchmark_model.h | 7
-rw-r--r--  tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc | 74
-rw-r--r--  tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc | 6
-rw-r--r--  tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h | 4
-rw-r--r--  tensorflow/contrib/optimizer_v2/BUILD | 1
-rw-r--r--  tensorflow/contrib/optimizer_v2/optimizer_v2_test.py | 40
-rw-r--r--  tensorflow/contrib/optimizer_v2/rmsprop_test.py | 718
-rw-r--r--  tensorflow/contrib/proto/BUILD | 12
-rw-r--r--  tensorflow/contrib/proto/python/kernel_tests/BUILD | 27
-rw-r--r--  tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test.py | 36
-rw-r--r--  tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py | 176
-rw-r--r--  tensorflow/contrib/rnn/python/ops/rnn.py | 2
-rw-r--r--  tensorflow/contrib/saved_model/BUILD | 29
-rw-r--r--  tensorflow/contrib/saved_model/__init__.py | 3
-rw-r--r--  tensorflow/contrib/saved_model/python/saved_model/__init__.py | 1
-rw-r--r--  tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py | 108
-rw-r--r--  tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py | 201
-rw-r--r--  tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py | 13
-rw-r--r--  tensorflow/contrib/signal/python/ops/mel_ops.py | 24
-rw-r--r--  tensorflow/contrib/summary/summary_ops_test.py | 18
-rw-r--r--  tensorflow/contrib/tensorrt/BUILD | 65
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_graph.cc | 39
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_nodes.cc | 168
-rw-r--r--  tensorflow/contrib/tensorrt/convert/convert_nodes.h | 26
-rw-r--r--  tensorflow/contrib/tensorrt/convert/utils.cc | 2
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc | 138
-rw-r--r--  tensorflow/contrib/tensorrt/kernels/trt_engine_op.h | 6
-rw-r--r--  tensorflow/contrib/tensorrt/segment/segment.cc | 188
-rw-r--r--  tensorflow/contrib/tensorrt/segment/segment.h | 20
-rw-r--r--  tensorflow/contrib/tensorrt/segment/segment_test.cc | 473
-rw-r--r--  tensorflow/contrib/tensorrt/tensorrt_test.cc | 9
-rw-r--r--  tensorflow/contrib/tensorrt/test/base_test.py | 126
-rw-r--r--  tensorflow/contrib/tensorrt/test/batch_matmul_test.py | 76
-rw-r--r--  tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py | 112
-rw-r--r--  tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py | 119
-rw-r--r--  tensorflow/contrib/tensorrt/test/concatenation_test.py | 83
-rw-r--r--  tensorflow/contrib/tensorrt/test/const_broadcast_test.py | 68
-rw-r--r--  tensorflow/contrib/tensorrt/test/memory_alignment_test.py | 72
-rw-r--r--  tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py | 87
-rw-r--r--  tensorflow/contrib/tensorrt/test/neighboring_engine_test.py | 69
-rw-r--r--  tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py | 353
-rw-r--r--  tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py | 329
-rw-r--r--  tensorflow/contrib/tensorrt/test/unary_test.py | 110
-rw-r--r--  tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py | 82
-rw-r--r--  tensorflow/contrib/tensorrt/test/vgg_block_test.py | 73
-rw-r--r--  tensorflow/contrib/tpu/BUILD | 13
-rw-r--r--  tensorflow/contrib/tpu/profiler/tpu_profiler.proto | 26
-rw-r--r--  tensorflow/contrib/tpu/proto/optimization_parameters.proto | 114
-rw-r--r--  tensorflow/contrib/tpu/python/tpu/error_handling.py | 132
-rw-r--r--  tensorflow/contrib/tpu/python/tpu/keras_support.py | 32
-rw-r--r--  tensorflow/contrib/tpu/python/tpu/tpu.py | 29
-rw-r--r--  tensorflow/contrib/tpu/python/tpu/tpu_context.py | 50
-rw-r--r--  tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 306
-rw-r--r--  tensorflow/contrib/verbs/rdma_mgr.cc | 30
-rw-r--r--  tensorflow/core/BUILD | 39
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt | 2
-rw-r--r--  tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt | 6
-rw-r--r--  tensorflow/core/common_runtime/direct_session.cc | 28
-rw-r--r--  tensorflow/core/common_runtime/direct_session_test.cc | 139
-rw-r--r--  tensorflow/core/common_runtime/eager/context.cc | 19
-rw-r--r--  tensorflow/core/common_runtime/eager/context.h | 9
-rw-r--r--  tensorflow/core/common_runtime/eager/execute.cc | 146
-rw-r--r--  tensorflow/core/common_runtime/function.cc | 19
-rw-r--r--  tensorflow/core/common_runtime/function_test.cc | 5
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc | 74
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_event_mgr.h | 22
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc | 23
-rw-r--r--  tensorflow/core/common_runtime/mkl_cpu_allocator.h | 1
-rw-r--r--  tensorflow/core/common_runtime/placer.cc | 59
-rw-r--r--  tensorflow/core/common_runtime/session_ref.cc | 170
-rw-r--r--  tensorflow/core/common_runtime/session_ref.h | 86
-rw-r--r--  tensorflow/core/common_runtime/threadpool_device.cc | 16
-rw-r--r--  tensorflow/core/distributed_runtime/eager/eager_client.h | 1
-rw-r--r--  tensorflow/core/distributed_runtime/eager/eager_service_impl.cc | 36
-rw-r--r--  tensorflow/core/distributed_runtime/eager/eager_service_impl.h | 3
-rw-r--r--  tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc | 79
-rw-r--r--  tensorflow/core/distributed_runtime/eager/remote_execute_node.h | 19
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc | 1
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc | 14
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h | 15
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc | 5
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h | 6
-rw-r--r--  tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc | 1
-rw-r--r--  tensorflow/core/framework/allocator.cc | 41
-rw-r--r--  tensorflow/core/framework/allocator.h | 12
-rw-r--r--  tensorflow/core/framework/allocator_registry.cc | 120
-rw-r--r--  tensorflow/core/framework/allocator_registry.h | 111
-rw-r--r--  tensorflow/core/framework/bfloat16_test.cc | 3
-rw-r--r--  tensorflow/core/framework/device_base.h | 4
-rw-r--r--  tensorflow/core/framework/function.cc | 5
-rw-r--r--  tensorflow/core/framework/function.h | 8
-rw-r--r--  tensorflow/core/framework/function_testlib.cc | 7
-rw-r--r--  tensorflow/core/framework/function_testlib.h | 2
-rw-r--r--  tensorflow/core/framework/node_def_util.cc | 13
-rw-r--r--  tensorflow/core/framework/node_def_util.h | 6
-rw-r--r--  tensorflow/core/framework/node_def_util_test.cc | 52
-rw-r--r--  tensorflow/core/framework/op_compatibility_test.cc | 65
-rw-r--r--  tensorflow/core/framework/op_kernel.cc | 51
-rw-r--r--  tensorflow/core/framework/op_kernel.h | 50
-rw-r--r--  tensorflow/core/framework/op_kernel_test.cc | 18
-rw-r--r--  tensorflow/core/framework/tensor.h | 1
-rw-r--r--  tensorflow/core/graph/control_flow.cc | 37
-rw-r--r--  tensorflow/core/graph/control_flow_test.cc | 17
-rw-r--r--  tensorflow/core/graph/graph_partition.cc | 8
-rw-r--r--  tensorflow/core/graph/mkl_graph_util.h | 1
-rw-r--r--  tensorflow/core/graph/mkl_layout_pass.cc | 13
-rw-r--r--  tensorflow/core/graph/mkl_layout_pass_test.cc | 1
-rw-r--r--  tensorflow/core/graph/mkl_tfconversion_pass.cc | 1
-rw-r--r--  tensorflow/core/graph/mkl_tfconversion_pass_test.cc | 1
-rw-r--r--  tensorflow/core/grappler/BUILD | 30
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties.cc | 18
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties_test.cc | 38
-rw-r--r--  tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt | 239
-rw-r--r--  tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 135
-rw-r--r--  tensorflow/core/grappler/costs/virtual_scheduler.cc | 14
-rw-r--r--  tensorflow/core/grappler/graph_view.cc | 37
-rw-r--r--  tensorflow/core/grappler/graph_view.h | 20
-rw-r--r--  tensorflow/core/grappler/mutable_graph_view.cc | 72
-rw-r--r--  tensorflow/core/grappler/mutable_graph_view.h | 56
-rw-r--r--  tensorflow/core/grappler/mutable_graph_view_test.cc | 94
-rw-r--r--  tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc | 9
-rw-r--r--  tensorflow/core/grappler/optimizers/constant_folding.cc | 15
-rw-r--r--  tensorflow/core/grappler/optimizers/data/BUILD | 52
-rw-r--r--  tensorflow/core/grappler/optimizers/data/graph_utils.cc | 173
-rw-r--r--  tensorflow/core/grappler/optimizers/data/graph_utils.h | 77
-rw-r--r--  tensorflow/core/grappler/optimizers/data/graph_utils_test.cc | 221
-rw-r--r--  tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc | 128
-rw-r--r--  tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc | 164
-rw-r--r--  tensorflow/core/grappler/optimizers/data/map_fusion.cc | 261
-rw-r--r--  tensorflow/core/grappler/optimizers/data/map_fusion.h | 47
-rw-r--r--  tensorflow/core/grappler/optimizers/data/map_fusion_test.cc | 90
-rw-r--r--  tensorflow/core/grappler/optimizers/data/noop_elimination.cc | 9
-rw-r--r--  tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc | 95
-rw-r--r--  tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc | 67
-rw-r--r--  tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc | 74
-rw-r--r--  tensorflow/core/grappler/optimizers/dependency_optimizer.cc | 13
-rw-r--r--  tensorflow/core/grappler/optimizers/loop_optimizer.cc | 14
-rw-r--r--  tensorflow/core/grappler/utils.cc | 42
-rw-r--r--  tensorflow/core/grappler/utils.h | 10
-rw-r--r--  tensorflow/core/grappler/utils_test.cc | 6
-rw-r--r--  tensorflow/core/kernels/BUILD | 34
-rw-r--r--  tensorflow/core/kernels/argmax_op.cc | 4
-rw-r--r--  tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h | 57
-rw-r--r--  tensorflow/core/kernels/cast_op.cc | 56
-rw-r--r--  tensorflow/core/kernels/cast_op.h | 166
-rw-r--r--  tensorflow/core/kernels/cast_op_gpu.cu.cc | 48
-rw-r--r--  tensorflow/core/kernels/cast_op_impl.h | 152
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_bfloat.cc | 11
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_bool.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_complex128.cc | 6
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_complex64.cc | 6
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_double.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_float.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_half.cc | 6
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_int16.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_int32.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_int64.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_int8.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_uint16.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_uint32.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_uint64.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_impl_uint8.cc | 9
-rw-r--r--  tensorflow/core/kernels/cast_op_test.cc | 41
-rw-r--r--  tensorflow/core/kernels/conv_grad_filter_ops.cc | 1
-rw-r--r--  tensorflow/core/kernels/conv_grad_input_ops.cc | 1
-rw-r--r--  tensorflow/core/kernels/conv_grad_ops_3d.cc | 2
-rw-r--r--  tensorflow/core/kernels/conv_ops.cc | 1
-rw-r--r--  tensorflow/core/kernels/conv_ops_3d.cc | 1
-rw-r--r--  tensorflow/core/kernels/conv_ops_gpu.h | 20
-rw-r--r--  tensorflow/core/kernels/conv_ops_test.cc | 62
-rw-r--r--  tensorflow/core/kernels/crop_and_resize_op.cc | 64
-rw-r--r--  tensorflow/core/kernels/crop_resize_bilinear_core.h | 464
-rw-r--r--  tensorflow/core/kernels/cwise_op_tan.cc | 3
-rw-r--r--  tensorflow/core/kernels/data/stats_dataset_ops.cc | 2
-rw-r--r--  tensorflow/core/kernels/functional_ops.cc | 1
-rw-r--r--  tensorflow/core/kernels/fused_batch_norm_op.cc | 24
-rw-r--r--  tensorflow/core/kernels/gather_nd_op.cc | 7
-rw-r--r--  tensorflow/core/kernels/matmul_op.cc | 12
-rw-r--r--  tensorflow/core/kernels/mkl_concat_op.cc | 8
-rw-r--r--  tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc | 21
-rw-r--r--  tensorflow/core/kernels/mkl_conv_grad_input_ops.cc | 18
-rw-r--r--  tensorflow/core/kernels/mkl_conv_ops.cc | 17
-rw-r--r--  tensorflow/core/kernels/mkl_conv_ops.h | 1
-rw-r--r--  tensorflow/core/kernels/mkl_pooling_ops_common.h | 1
-rw-r--r--  tensorflow/core/kernels/partitioned_function_ops.cc | 1
-rw-r--r--  tensorflow/core/kernels/resize_bilinear_op.cc | 153
-rw-r--r--  tensorflow/core/kernels/resource_variable_ops.cc | 68
-rw-r--r--  tensorflow/core/kernels/save_restore_tensor.cc | 150
-rw-r--r--tensorflow/core/kernels/scatter_nd_op.cc6
-rw-r--r--tensorflow/core/kernels/scatter_nd_op_test.cc2
-rw-r--r--tensorflow/core/kernels/strided_slice_op.cc4
-rw-r--r--tensorflow/core/kernels/training_op_helpers.cc9
-rw-r--r--tensorflow/core/kernels/training_op_helpers.h14
-rw-r--r--tensorflow/core/lib/bfloat16/bfloat16.h20
-rw-r--r--tensorflow/core/lib/core/errors.h20
-rw-r--r--tensorflow/core/lib/db/sqlite.cc1
-rw-r--r--tensorflow/core/ops/compat/ops_history.v1.pbtxt26
-rw-r--r--tensorflow/core/ops/lookup_ops.cc2
-rw-r--r--tensorflow/core/ops/math_ops.cc9
-rw-r--r--tensorflow/core/ops/math_ops_test.cc15
-rw-r--r--tensorflow/core/ops/ops.pbtxt7
-rw-r--r--tensorflow/core/platform/default/build_config_root.bzl6
-rw-r--r--tensorflow/core/platform/env_test.cc2
-rw-r--r--tensorflow/core/platform/windows/port.cc20
-rw-r--r--tensorflow/core/protobuf/config.proto5
-rw-r--r--tensorflow/core/protobuf/eager_service.proto23
-rw-r--r--tensorflow/core/util/equal_graph_def_test.cc6
-rw-r--r--tensorflow/core/util/mkl_util.h22
-rw-r--r--tensorflow/core/util/stat_summarizer.cc22
-rw-r--r--tensorflow/core/util/stats_calculator.cc21
-rw-r--r--tensorflow/core/util/stats_calculator.h5
-rw-r--r--tensorflow/core/util/stats_calculator_test.cc76
-rw-r--r--tensorflow/docs_src/deploy/distributed.md2
-rw-r--r--tensorflow/docs_src/guide/debugger.md2
-rw-r--r--tensorflow/docs_src/guide/saved_model.md5
-rw-r--r--tensorflow/docs_src/guide/using_gpu.md2
-rw-r--r--tensorflow/docs_src/install/install_sources.md96
-rw-r--r--tensorflow/docs_src/mobile/README.md3
-rw-r--r--tensorflow/docs_src/mobile/index.md33
-rw-r--r--tensorflow/docs_src/mobile/leftnav_files15
-rw-r--r--tensorflow/docs_src/performance/performance_guide.md2
-rw-r--r--tensorflow/docs_src/performance/xla/broadcasting.md2
-rw-r--r--tensorflow/docs_src/performance/xla/operation_semantics.md125
-rw-r--r--tensorflow/docs_src/tutorials/_index.yaml52
-rw-r--r--tensorflow/examples/tutorials/mnist/mnist_deep.py13
-rw-r--r--tensorflow/go/README.md2
-rw-r--r--tensorflow/go/op/wrappers.go16
-rw-r--r--tensorflow/java/maven/hadoop/pom.xml2
-rw-r--r--tensorflow/java/maven/libtensorflow/pom.xml2
-rw-r--r--tensorflow/java/maven/libtensorflow_jni/pom.xml2
-rw-r--r--tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml2
-rw-r--r--tensorflow/java/maven/pom.xml2
-rw-r--r--tensorflow/java/maven/proto/pom.xml2
-rw-r--r--tensorflow/java/maven/spark-connector/pom.xml2
-rw-r--r--tensorflow/java/maven/tensorflow/pom.xml2
-rw-r--r--tensorflow/python/BUILD34
-rw-r--r--tensorflow/python/client/session.py15
-rw-r--r--tensorflow/python/client/tf_session.i1
-rw-r--r--tensorflow/python/client/tf_session_helper.cc14
-rw-r--r--tensorflow/python/client/tf_session_helper.h3
-rw-r--r--tensorflow/python/data/ops/iterator_ops.py11
-rw-r--r--tensorflow/python/distribute/BUILD43
-rw-r--r--tensorflow/python/distribute/distribute_coordinator.py361
-rw-r--r--tensorflow/python/distribute/distribute_coordinator_test.py291
-rw-r--r--tensorflow/python/eager/BUILD2
-rw-r--r--tensorflow/python/eager/backprop.py24
-rw-r--r--tensorflow/python/eager/backprop_test.py56
-rw-r--r--tensorflow/python/eager/benchmarks_test.py60
-rw-r--r--tensorflow/python/eager/context.py5
-rw-r--r--tensorflow/python/eager/core_test.py8
-rw-r--r--tensorflow/python/eager/execute.py6
-rw-r--r--tensorflow/python/eager/function.py72
-rw-r--r--tensorflow/python/eager/function_test.py13
-rw-r--r--tensorflow/python/eager/memory_test.py4
-rw-r--r--tensorflow/python/eager/ops_test.py4
-rw-r--r--tensorflow/python/eager/pywrap_tensor.cc1
-rw-r--r--tensorflow/python/eager/pywrap_tfe_src.cc4
-rw-r--r--tensorflow/python/estimator/BUILD3
-rw-r--r--tensorflow/python/estimator/canned/boosted_trees.py45
-rw-r--r--tensorflow/python/estimator/canned/boosted_trees_test.py3
-rw-r--r--tensorflow/python/estimator/estimator.py63
-rw-r--r--tensorflow/python/estimator/keras.py117
-rw-r--r--tensorflow/python/estimator/keras_test.py155
-rw-r--r--tensorflow/python/estimator/model_fn.py58
-rw-r--r--tensorflow/python/framework/error_interpolation.py53
-rw-r--r--tensorflow/python/framework/error_interpolation_test.py105
-rw-r--r--tensorflow/python/framework/fast_tensor_util.pyx7
-rw-r--r--tensorflow/python/framework/function.py26
-rw-r--r--tensorflow/python/framework/function_def_to_graph.py32
-rw-r--r--tensorflow/python/framework/function_def_to_graph_test.py34
-rw-r--r--tensorflow/python/framework/importer.py2
-rw-r--r--tensorflow/python/framework/importer_test.py41
-rw-r--r--tensorflow/python/framework/kernels.py46
-rw-r--r--tensorflow/python/framework/kernels_test.py41
-rw-r--r--tensorflow/python/framework/meta_graph.py68
-rw-r--r--tensorflow/python/framework/ops.py191
-rw-r--r--tensorflow/python/framework/ops_test.py67
-rw-r--r--tensorflow/python/framework/tensor_util.py17
-rw-r--r--tensorflow/python/framework/test_util.py78
-rw-r--r--tensorflow/python/framework/test_util_test.py2
-rwxr-xr-xtensorflow/python/keras/BUILD1
-rw-r--r--tensorflow/python/keras/activations.py22
-rw-r--r--tensorflow/python/keras/backend.py32
-rw-r--r--tensorflow/python/keras/backend_test.py61
-rw-r--r--tensorflow/python/keras/callbacks.py107
-rw-r--r--tensorflow/python/keras/callbacks_test.py114
-rw-r--r--tensorflow/python/keras/engine/base_layer.py74
-rw-r--r--tensorflow/python/keras/engine/network.py110
-rw-r--r--tensorflow/python/keras/engine/saving_test.py41
-rw-r--r--tensorflow/python/keras/engine/sequential.py28
-rw-r--r--tensorflow/python/keras/engine/sequential_test.py46
-rw-r--r--tensorflow/python/keras/engine/topology_test.py99
-rw-r--r--tensorflow/python/keras/engine/training.py117
-rw-r--r--tensorflow/python/keras/engine/training_eager.py38
-rw-r--r--tensorflow/python/keras/engine/training_test.py64
-rw-r--r--tensorflow/python/keras/engine/training_utils.py63
-rw-r--r--tensorflow/python/keras/layers/advanced_activations.py37
-rw-r--r--tensorflow/python/keras/layers/advanced_activations_test.py8
-rw-r--r--tensorflow/python/keras/layers/cudnn_recurrent_test.py4
-rw-r--r--tensorflow/python/keras/metrics.py175
-rw-r--r--tensorflow/python/keras/metrics_test.py95
-rw-r--r--tensorflow/python/keras/model_subclassing_test.py273
-rw-r--r--tensorflow/python/keras/testing_utils.py73
-rw-r--r--tensorflow/python/kernel_tests/BUILD4
-rw-r--r--tensorflow/python/kernel_tests/argmax_op_test.py7
-rw-r--r--tensorflow/python/kernel_tests/cond_v2_test.py372
-rw-r--r--tensorflow/python/kernel_tests/depthwise_conv_op_test.py6
-rw-r--r--tensorflow/python/kernel_tests/distributions/util_test.py4
-rw-r--r--tensorflow/python/kernel_tests/gather_nd_op_test.py12
-rw-r--r--tensorflow/python/kernel_tests/linalg/BUILD18
-rw-r--r--tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py192
-rw-r--r--tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py4
-rw-r--r--tensorflow/python/kernel_tests/scatter_nd_ops_test.py4
-rw-r--r--tensorflow/python/ops/array_grad.py2
-rw-r--r--tensorflow/python/ops/boosted_trees_ops.py11
-rw-r--r--tensorflow/python/ops/cond_v2_impl.py149
-rw-r--r--tensorflow/python/ops/control_flow_ops.py29
-rw-r--r--tensorflow/python/ops/control_flow_util.py8
-rw-r--r--tensorflow/python/ops/conv2d_benchmark.py3
-rw-r--r--tensorflow/python/ops/linalg/linalg.py1
-rw-r--r--tensorflow/python/ops/linalg/linear_operator_zeros.py452
-rw-r--r--tensorflow/python/ops/resource_variable_ops.py27
-rw-r--r--tensorflow/python/ops/rnn.py28
-rw-r--r--tensorflow/python/ops/rnn_cell_impl.py5
-rw-r--r--tensorflow/python/ops/tensor_array_ops.py6
-rw-r--r--tensorflow/python/ops/variables.py41
-rw-r--r--tensorflow/python/profiler/profile_context.py33
-rw-r--r--tensorflow/python/profiler/profile_context_test.py2
-rw-r--r--tensorflow/python/saved_model/builder_impl.py76
-rw-r--r--tensorflow/python/saved_model/constants.py6
-rw-r--r--tensorflow/python/saved_model/loader_impl.py55
-rw-r--r--tensorflow/python/saved_model/loader_test.py19
-rw-r--r--tensorflow/python/saved_model/saved_model_test.py28
-rw-r--r--tensorflow/python/summary/writer/writer.py2
-rw-r--r--tensorflow/python/training/checkpoint_utils.py36
-rw-r--r--tensorflow/python/training/checkpoint_utils_test.py4
-rw-r--r--tensorflow/python/training/checkpointable/base.py8
-rw-r--r--tensorflow/python/training/checkpointable/base_test.py20
-rw-r--r--tensorflow/python/training/checkpointable/data_structures.py189
-rw-r--r--tensorflow/python/training/checkpointable/data_structures_test.py119
-rw-r--r--tensorflow/python/training/checkpointable/tracking_test.py37
-rw-r--r--tensorflow/python/training/checkpointable/util.py75
-rw-r--r--tensorflow/python/training/learning_rate_decay.py14
-rw-r--r--tensorflow/python/training/saver.py39
-rw-r--r--tensorflow/python/training/saver_test.py31
-rw-r--r--tensorflow/python/util/function_utils.py35
-rw-r--r--tensorflow/python/util/function_utils_test.py78
-rw-r--r--tensorflow/python/util/nest.py11
-rw-r--r--tensorflow/python/util/nest_test.py68
-rw-r--r--tensorflow/python/util/util.cc308
-rw-r--r--tensorflow/python/util/util.h4
-rw-r--r--tensorflow/python/util/util.i3
-rw-r--r--tensorflow/stream_executor/cuda/cuda_dnn.cc1
-rw-r--r--tensorflow/stream_executor/cuda/cuda_driver.cc117
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.cc179
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.h16
-rw-r--r--tensorflow/stream_executor/host/host_gpu_executor.cc2
-rw-r--r--tensorflow/stream_executor/module_spec.h66
-rw-r--r--tensorflow/stream_executor/stream.cc15
-rw-r--r--tensorflow/stream_executor/stream.h2
-rw-r--r--tensorflow/stream_executor/stream_executor_internal.h32
-rw-r--r--tensorflow/stream_executor/stream_executor_pimpl.cc38
-rw-r--r--tensorflow/stream_executor/stream_executor_pimpl.h76
-rw-r--r--tensorflow/stream_executor/stream_test.cc139
-rw-r--r--tensorflow/tensorflow.bzl74
-rw-r--r--tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt2
-rw-r--r--tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt2
-rw-r--r--tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt4
-rw-r--r--tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt2
-rw-r--r--tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt2
-rw-r--r--tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt2
-rw-r--r--tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt2
-rw-r--r--tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt4
-rw-r--r--tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt14
-rw-r--r--tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt130
-rw-r--r--tensorflow/tools/api/golden/tensorflow.linalg.pbtxt4
-rw-r--r--tensorflow/tools/ci_build/Dockerfile.gpu9
-rwxr-xr-xtensorflow/tools/ci_build/ci_sanity.sh2
-rwxr-xr-xtensorflow/tools/ci_build/install/install_bazel.sh2
-rwxr-xr-xtensorflow/tools/ci_build/install/install_bazel_from_source.sh2
-rw-r--r--tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh8
-rw-r--r--tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh37
-rw-r--r--tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh37
-rw-r--r--tensorflow/tools/compatibility/BUILD57
-rw-r--r--tensorflow/tools/compatibility/renames_v2.py134
-rw-r--r--tensorflow/tools/compatibility/testdata/test_file_v1_10.py34
-rw-r--r--tensorflow/tools/compatibility/tf_upgrade.py486
-rw-r--r--tensorflow/tools/compatibility/tf_upgrade_test.py5
-rw-r--r--tensorflow/tools/compatibility/tf_upgrade_v2.py115
-rw-r--r--tensorflow/tools/compatibility/tf_upgrade_v2_test.py83
-rw-r--r--tensorflow/tools/compatibility/update/BUILD15
-rw-r--r--tensorflow/tools/compatibility/update/generate_v2_renames_map.py103
-rw-r--r--tensorflow/tools/docker/Dockerfile.devel2
-rw-r--r--tensorflow/tools/docker/Dockerfile.devel-gpu6
-rw-r--r--tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn72
-rw-r--r--tensorflow/tools/graph_transforms/fold_old_batch_norms.cc2
-rw-r--r--tensorflow/tools/pip_package/BUILD2
-rw-r--r--tensorflow/tools/pip_package/setup.py4
-rw-r--r--tensorflow/workspace.bzl18
1117 files changed, 43549 insertions, 15114 deletions
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 518c2b0489..60db234c9c 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -24,6 +24,14 @@ load(
"gen_api_init_files", # @unused
)
+# Config setting used when building for products
+# that must avoid restrictively licensed dependencies.
+config_setting(
+ name = "no_lgpl_deps",
+ values = {"define": "__TENSORFLOW_NO_LGPL_DEPS__=1"},
+ visibility = ["//visibility:public"],
+)
+
# Config setting for determining if we are building for Android.
config_setting(
name = "android",
@@ -373,6 +381,14 @@ config_setting(
},
)
+# Setting to use when loading kernels dynamically
+config_setting(
+ name = "dynamic_loaded_kernels",
+ define_values = {
+ "dynamic_loaded_kernels": "true",
+ },
+)
+
config_setting(
name = "using_cuda_nvcc",
define_values = {
@@ -400,14 +416,6 @@ config_setting(
visibility = ["//visibility:public"],
)
-# TODO(laigd): consider removing this option and make TensorRT enabled
-# automatically when CUDA is enabled.
-config_setting(
- name = "with_tensorrt_support",
- values = {"define": "with_tensorrt_support=true"},
- visibility = ["//visibility:public"],
-)
-
package_group(
name = "internal",
packages = [
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index a3003953a3..10bc8cdbee 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -33,6 +33,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/eval_const_tensor.h"
#include "tensorflow/core/common_runtime/shape_refiner.h"
#include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/framework/op_kernel.h"
@@ -327,6 +328,7 @@ TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len) {
}
void TF_DeleteBuffer(TF_Buffer* buffer) {
+ if (buffer == nullptr) return;
if (buffer->data_deallocator != nullptr) {
(*buffer->data_deallocator)(const_cast<void*>(buffer->data),
buffer->length);
@@ -356,6 +358,7 @@ void TF_CloseDeprecatedSession(TF_DeprecatedSession* s, TF_Status* status) {
void TF_DeleteDeprecatedSession(TF_DeprecatedSession* s, TF_Status* status) {
status->status = Status::OK();
+ if (s == nullptr) return;
delete s->session;
delete s;
}
@@ -906,6 +909,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) {
TF_Buffer TF_GetOpList(TF_Library* lib_handle) { return lib_handle->op_list; }
void TF_DeleteLibraryHandle(TF_Library* lib_handle) {
+ if (lib_handle == nullptr) return;
tensorflow::port::Free(const_cast<void*>(lib_handle->op_list.data));
delete lib_handle;
}
@@ -1853,6 +1857,7 @@ TF_Graph::TF_Graph()
TF_Graph* TF_NewGraph() { return new TF_Graph; }
void TF_DeleteGraph(TF_Graph* g) {
+ if (g == nullptr) return;
g->mu.lock();
g->delete_requested = true;
const bool del = g->sessions.empty();
@@ -2528,6 +2533,7 @@ void TF_CloseSession(TF_Session* s, TF_Status* status) {
void TF_DeleteSession(TF_Session* s, TF_Status* status) {
status->status = Status::OK();
+ if (s == nullptr) return;
TF_Graph* const graph = s->graph;
if (graph != nullptr) {
graph->mu.lock();
@@ -2726,7 +2732,34 @@ TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, const char* name,
TF_Buffer* ret = TF_NewBuffer();
status->status = MessageToBuffer(*api_def, ret);
+ if (!status->status.ok()) {
+ TF_DeleteBuffer(ret);
+ return nullptr;
+ }
return ret;
#endif // __ANDROID__
}
+
+TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status) {
+ tensorflow::KernelList kernel_list = tensorflow::GetAllRegisteredKernels();
+ TF_Buffer* ret = TF_NewBuffer();
+ status->status = MessageToBuffer(kernel_list, ret);
+ if (!status->status.ok()) {
+ TF_DeleteBuffer(ret);
+ return nullptr;
+ }
+ return ret;
+}
+
+TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) {
+ tensorflow::KernelList kernel_list =
+ tensorflow::GetRegisteredKernelsForOp(name);
+ TF_Buffer* ret = TF_NewBuffer();
+ status->status = MessageToBuffer(kernel_list, ret);
+ if (!status->status.ok()) {
+ TF_DeleteBuffer(ret);
+ return nullptr;
+ }
+ return ret;
+}
} // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index fddc09d45e..c8ae6f2dd1 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -44,6 +44,7 @@ limitations under the License.
// * size_t is used to represent byte sizes of objects that are
// materialized in the address space of the calling process.
// * int is used as an index into arrays.
+// * Deletion functions are safe to call on nullptr.
//
// Questions left to address:
// * Might at some point need a way for callers to provide their own Env.
@@ -1610,6 +1611,18 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map,
size_t name_len,
TF_Status* status);
+// --------------------------------------------------------------------------
+// Kernel definition information.
+
+// Returns a serialized KernelList protocol buffer containing KernelDefs for all
+// registered kernels.
+TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status);
+
+// Returns a serialized KernelList protocol buffer containing KernelDefs for all
+// kernels registered for the operation named `name`.
+TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp(
+ const char* name, TF_Status* status);
+
#ifdef __cplusplus
} /* end extern "C" */
#endif
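A minimal sketch of how a caller might exercise the new kernel-query entry points, assuming the caller links the generated KernelList proto and can mix the C API with C++ (the helper name and error handling here are illustrative, not part of the patch):

    #include <cstdio>

    #include "tensorflow/c/c_api.h"
    #include "tensorflow/core/framework/kernel_def.pb.h"

    // Print the op name and device type of every registered kernel.
    void PrintRegisteredKernels() {
      TF_Status* status = TF_NewStatus();
      TF_Buffer* buf = TF_GetAllRegisteredKernels(status);
      if (TF_GetCode(status) == TF_OK) {
        tensorflow::KernelList kernels;
        kernels.ParseFromArray(buf->data, buf->length);
        for (const auto& kernel : kernels.kernel()) {
          std::printf("%s on %s\n", kernel.op().c_str(),
                      kernel.device_type().c_str());
        }
      }
      TF_DeleteBuffer(buf);  // Safe even when buf is nullptr, per the new contract.
      TF_DeleteStatus(status);
    }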
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index bc04b53fbb..e674b1623c 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -29,9 +29,11 @@ limitations under the License.
#include "tensorflow/core/framework/api_def.pb.h"
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/graph.pb_text.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
#include "tensorflow/core/framework/node_def.pb_text.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
@@ -1424,6 +1426,29 @@ TEST(CAPI, SavedModelNullArgsAreValid) {
TF_DeleteStatus(s);
}
+TEST(CAPI, DeletingNullPointerIsSafe) {
+ TF_Status* status = TF_NewStatus();
+
+ TF_DeleteStatus(nullptr);
+ TF_DeleteBuffer(nullptr);
+ TF_DeleteTensor(nullptr);
+ TF_DeleteSessionOptions(nullptr);
+ TF_DeleteGraph(nullptr);
+ TF_DeleteImportGraphDefOptions(nullptr);
+ TF_DeleteImportGraphDefResults(nullptr);
+ TF_DeleteFunction(nullptr);
+ TF_DeleteSession(nullptr, status);
+ EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+ TF_DeletePRunHandle(nullptr);
+ TF_DeleteDeprecatedSession(nullptr, status);
+ EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+ TF_DeleteDeviceList(nullptr);
+ TF_DeleteLibraryHandle(nullptr);
+ TF_DeleteApiDefMap(nullptr);
+
+ TF_DeleteStatus(status);
+}
+
REGISTER_OP("TestOpWithNoGradient")
.Input("x: T")
.Output("y: T")
@@ -2312,6 +2337,57 @@ TEST(TestApiDef, TestCreateApiDefWithOverwrites) {
TF_DeleteLibraryHandle(lib);
}
+class DummyKernel : public tensorflow::OpKernel {
+ public:
+ explicit DummyKernel(tensorflow::OpKernelConstruction* context)
+ : OpKernel(context) {}
+ void Compute(tensorflow::OpKernelContext* context) override {}
+};
+
+// Test that we can query registered kernels.
+REGISTER_OP("TestOpWithSingleKernel")
+ .Input("a: float")
+ .Input("b: float")
+ .Output("o: float");
+REGISTER_KERNEL_BUILDER(
+ Name("TestOpWithSingleKernel").Device(tensorflow::DEVICE_CPU), DummyKernel);
+
+TEST(TestKernel, TestGetAllRegisteredKernels) {
+ TF_Status* status = TF_NewStatus();
+ TF_Buffer* kernel_list_buf = TF_GetAllRegisteredKernels(status);
+ EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+ KernelList kernel_list;
+ kernel_list.ParseFromArray(kernel_list_buf->data, kernel_list_buf->length);
+ ASSERT_GT(kernel_list.kernel_size(), 0);
+ TF_DeleteBuffer(kernel_list_buf);
+ TF_DeleteStatus(status);
+}
+
+TEST(TestKernel, TestGetRegisteredKernelsForOp) {
+ TF_Status* status = TF_NewStatus();
+ TF_Buffer* kernel_list_buf =
+ TF_GetRegisteredKernelsForOp("TestOpWithSingleKernel", status);
+ EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+ KernelList kernel_list;
+ kernel_list.ParseFromArray(kernel_list_buf->data, kernel_list_buf->length);
+ ASSERT_EQ(kernel_list.kernel_size(), 1);
+ EXPECT_EQ(kernel_list.kernel(0).op(), "TestOpWithSingleKernel");
+ EXPECT_EQ(kernel_list.kernel(0).device_type(), "CPU");
+ TF_DeleteBuffer(kernel_list_buf);
+ TF_DeleteStatus(status);
+}
+
+TEST(TestKernel, TestGetRegisteredKernelsForOpNoKernels) {
+ TF_Status* status = TF_NewStatus();
+ TF_Buffer* kernel_list_buf = TF_GetRegisteredKernelsForOp("Unknown", status);
+ EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+ KernelList kernel_list;
+ kernel_list.ParseFromArray(kernel_list_buf->data, kernel_list_buf->length);
+ ASSERT_EQ(kernel_list.kernel_size(), 0);
+ TF_DeleteBuffer(kernel_list_buf);
+ TF_DeleteStatus(status);
+}
+
#undef EXPECT_TF_META
} // namespace
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 6c510536d6..7321b4b791 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -288,7 +288,7 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
opts->async, std::move(device_mgr), r);
}
-void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) { delete ctx; }
+void TFE_DeleteContext(TFE_Context* ctx) { delete ctx; }
TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
TF_DeviceList* list = new TF_DeviceList;
@@ -336,7 +336,7 @@ TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
}
void TFE_DeleteTensorHandle(TFE_TensorHandle* h) {
- DCHECK(h);
+ if (h == nullptr) return;
if (h->handle) {
h->handle->Unref();
}
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index fdbd5374b2..ea019a5711 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -102,8 +102,7 @@ typedef struct TFE_Context TFE_Context;
TF_CAPI_EXPORT extern TFE_Context* TFE_NewContext(
const TFE_ContextOptions* opts, TF_Status* status);
-TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx,
- TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx);
TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
TF_Status* status);
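For client code migrating across this signature change, the update is mechanical; a hedged before/after sketch, assuming a typical teardown sequence:

    // Before this change (the status argument was accepted but never set):
    //   TFE_DeleteContext(ctx, status);
    // After (deletion cannot fail, so no status is taken):
    TFE_DeleteContext(ctx);
    TF_DeleteStatus(status);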
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 3504a8b5e7..0bdea70fe6 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -49,7 +49,7 @@ void BM_InitOp(int iters) {
}
tensorflow::testing::StopTiming();
TFE_DeleteTensorHandle(m);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}
@@ -80,7 +80,7 @@ void BM_Execute(int iters, int async) {
tensorflow::testing::StopTiming();
TFE_DeleteOp(matmul);
TFE_DeleteTensorHandle(m);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}
@@ -95,7 +95,7 @@ TEST(CAPI, Context) {
TF_DeviceList* devices = TFE_ContextListDevices(ctx, status);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
const int num_devices = TF_DeviceListCount(devices);
@@ -195,7 +195,7 @@ void TestRemoteExecute(bool async) {
TFE_DeleteOp(matmul);
TFE_ContextAsyncWait(ctx, status);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
@@ -281,7 +281,7 @@ void TestRemoteExecuteSilentCopies(bool async) {
TFE_DeleteOp(matmul);
TFE_ContextAsyncWait(ctx, status);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
@@ -380,8 +380,7 @@ void TensorHandleCopyBetweenDevices(bool async) {
TF_DeleteDeviceList(devices);
TF_DeleteTensor(t);
TFE_DeleteTensorHandle(hcpu);
- TFE_DeleteContext(ctx, status.get());
- EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+ TFE_DeleteContext(ctx);
}
TEST(CAPI, TensorHandleCopyBetweenDevices) {
@@ -418,7 +417,7 @@ void TensorHandleCopyBetweenDevicesError(bool async) {
TFE_DeleteTensorHandle(hcopy);
TFE_DeleteTensorHandle(hcpu);
if (hdevice != nullptr) TFE_DeleteTensorHandle(hdevice);
- TFE_DeleteContext(ctx, status.get());
+ TFE_DeleteContext(ctx);
}
TEST(CAPI, TensorHandleCopyBetweenDevicesError) {
@@ -451,7 +450,7 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) {
TF_DeleteDeviceList(devices);
TF_DeleteTensor(t);
TFE_DeleteTensorHandle(hcpu);
- TFE_DeleteContext(ctx, status.get());
+ TFE_DeleteContext(ctx);
return;
}
const string gpu_1_name(TF_DeviceListName(devices, 1, status.get()));
@@ -484,8 +483,7 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) {
TF_DeleteDeviceList(devices);
TF_DeleteTensor(t);
TFE_DeleteTensorHandle(hcpu);
- TFE_DeleteContext(ctx, status.get());
- EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+ TFE_DeleteContext(ctx);
}
TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevices) {
@@ -533,8 +531,7 @@ void TensorHandleSilentCopy(bool async) {
TFE_DeleteTensorHandle(hcpu);
TFE_ContextAsyncWait(ctx, status.get());
EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
- TFE_DeleteContext(ctx, status.get());
- EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+ TFE_DeleteContext(ctx);
}
TEST(CAPI, TensorHandleSilentCopy) { TensorHandleSilentCopy(false); }
@@ -580,8 +577,7 @@ void TensorHandleSilentCopyLocal(bool async) {
TFE_DeleteTensorHandle(hcpu);
TFE_ContextAsyncWait(ctx, status.get());
EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
- TFE_DeleteContext(ctx, status.get());
- EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+ TFE_DeleteContext(ctx);
}
TEST(CAPI, TensorHandleSilentCopyLocal) { TensorHandleSilentCopyLocal(false); }
TEST(CAPI, TensorHandleSilentCopyLocalAsync) {
@@ -614,7 +610,7 @@ void SetAndGetOpDevices(bool async) {
TFE_DeleteOp(matmul);
TFE_DeleteTensorHandle(m);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}
@@ -640,7 +636,7 @@ void Execute_MatMul_CPU(bool async) {
TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TFE_DeleteTensorHandle(retvals[0]);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
float product[4] = {0};
EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
@@ -712,7 +708,7 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) {
TFE_DeleteTensorHandle(m1);
TFE_DeleteTensorHandle(m2);
TFE_DeleteTensorHandle(retvals[0]);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
TF_DeleteStatus(status);
}
TEST(CAPI, Execute_MatMul_CPU_Runtime_Error) {
@@ -743,7 +739,7 @@ void Execute_MatMul_CPU_Type_Error(bool async) {
if (retvals[0] != nullptr) {
TFE_DeleteTensorHandle(retvals[0]);
}
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
TF_DeleteStatus(status);
}
@@ -781,7 +777,7 @@ TEST(CAPI, Execute_Min_CPU) {
TF_DeleteTensor(t);
EXPECT_EQ(1, output[0]);
EXPECT_EQ(3, output[1]);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}
@@ -823,7 +819,7 @@ void Execute_MatMul_XLA_CPU(bool async) {
EXPECT_EQ(10, product[1]);
EXPECT_EQ(15, product[2]);
EXPECT_EQ(22, product[3]);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
TF_DeleteStatus(status);
}
TEST(CAPI, Execute_MatMul_XLA_CPU) { Execute_MatMul_XLA_CPU(false); }
@@ -862,7 +858,7 @@ void Execute_Min_XLA_CPU(bool async) {
TF_DeleteTensor(t);
EXPECT_EQ(1, output[0]);
EXPECT_EQ(3, output[1]);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
TF_DeleteStatus(status);
}
TEST(CAPI, Execute_Min_XLA_CPU) { Execute_Min_XLA_CPU(false); }
@@ -898,7 +894,7 @@ void ExecuteWithTracing(bool async) {
TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
TFE_DeleteTensorHandle(retvals[0]);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
float product[4] = {0};
EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
@@ -974,7 +970,7 @@ TEST(CAPI, Function_ident_CPU) {
TF_DeleteTensor(r);
TFE_DeleteTensorHandle(result[0]);
}
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
TF_DeleteStatus(status);
}
@@ -1044,7 +1040,7 @@ TEST(CAPI, Function_ident_XLA_CPU) {
TF_DeleteTensor(r);
TFE_DeleteTensorHandle(result[0]);
}
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
TF_DeleteStatus(status);
}
@@ -1120,7 +1116,7 @@ void FunctionDefAndExecute(bool async) {
EXPECT_EQ(10, product[1]);
EXPECT_EQ(15, product[2]);
EXPECT_EQ(22, product[3]);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}
@@ -1161,7 +1157,7 @@ void BM_ExecuteFunction(int iters, int async) {
tensorflow::testing::StopTiming();
TFE_DeleteTensorHandle(m);
TFE_DeleteTensorHandle(retval[0]);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}
@@ -1249,7 +1245,7 @@ TEST(CAPI, Variables) {
TFE_DeleteTensorHandle(var_handle);
TFE_DeleteTensorHandle(value_handle);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}
@@ -1288,7 +1284,7 @@ void BM_ReadVariable(int iters) {
TFE_DeleteOp(op);
TFE_DeleteTensorHandle(var_handle);
- TFE_DeleteContext(ctx, status);
+ TFE_DeleteContext(ctx);
CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
TF_DeleteStatus(status);
}
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index fd7b6fe662..1c9bdff5e1 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -475,11 +475,7 @@ TEST_F(CWiseUnaryGradTest, Tan_Complex) {
auto x_fn = [this](const int i) {
return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
};
- // TODO(kbsriram)
- // Enable when tan kernel supports complex inputs
- if (false) {
- TestCWiseGrad<complex64, complex64>(TAN, x_fn);
- }
+ TestCWiseGrad<complex64, complex64>(TAN, x_fn);
}
TEST_F(CWiseUnaryGradTest, Atan) {
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index c73482d5f4..588e96cb19 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -47,6 +47,72 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op,
}
REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
+bool IsZero(const Scope& scope, const Output& grad) {
+ string op_type_name = grad.op().node()->type_string();
+ if (op_type_name == "ZerosLike" || op_type_name == "Zeros") {
+ return true;
+ }
+  // The operation we were given is not obviously a zero, so we would need to
+  // inspect its contents to decide. The original Python code did this by
+  // calling the utility function tensor_util.constant_value; there is no C++
+  // equivalent, so we conservatively return false for now.
+ return false;
+}
+
+// Multiply after broadcasting vec to match dimensions of mat.
+// Args:
+// vec: A 1-D tensor of dimension [D0]
+// mat: A 2-D tensor of dimension [D0, D1]
+//
+// Returns:
+// A tensor of dimension [D0, D1], the result of vec * mat.
+Output BroadcastMul(const Scope& scope, const Output& vec, const Output& mat) {
+ auto reshaped = ExpandDims(scope, vec, -1);
+ return Multiply(scope, reshaped, mat);
+}
+
+Status SoftmaxCrossEntropyWithLogitsGrad(const Scope& scope,
+ const Operation& op,
+ const std::vector<Output>& grad_inputs,
+ std::vector<Output>* grad_outputs) {
+  // Gradient for the softmax cross-entropy-with-logits function.
+  // We multiply the backprop for the cost with the softmax gradient stored in
+  // op.output[1]; the labels gradient is -grad_loss * log(softmax(logits)).
+
+ // The outputs of the network are at input index 0.
+ auto logits = op.input(0);
+ // The "truth" labels are at index 1.
+ auto softmax_grad = op.output(1);
+
+ // The loss is the output at index 0, and backprop is the output at index 1.
+ auto grad_loss = grad_inputs[0];
+ auto grad_grad = grad_inputs[1];
+
+ auto grad = BroadcastMul(scope, grad_loss, softmax_grad);
+ if (!IsZero(scope, grad_grad)) {
+ std::vector<int> axis;
+ auto logits_softmax = Softmax(scope, logits);
+
+ auto grad_grad_expand = ExpandDims(scope, grad_grad, 1);
+ auto logits_softmax_expand = ExpandDims(scope, logits_softmax, 2);
+ auto matmul_result =
+ BatchMatMul(scope, grad_grad_expand, logits_softmax_expand);
+ axis.push_back(1);
+ auto squeeze_result = Squeeze(scope, matmul_result, Squeeze::Axis(axis));
+ auto subtraction_result = Subtract(scope, grad_grad, squeeze_result);
+ auto multiply_result = Multiply(scope, subtraction_result, logits_softmax);
+ grad = Add(scope, grad, multiply_result);
+ }
+ auto minus_log_softmax = Multiply(scope, LogSoftmax(scope, logits), -1.0f);
+ grad_outputs->push_back(grad);
+ grad_outputs->push_back(BroadcastMul(scope, grad_loss, minus_log_softmax));
+ return scope.status();
+}
+REGISTER_GRADIENT_OP("SoftmaxCrossEntropyWithLogits",
+ SoftmaxCrossEntropyWithLogitsGrad);
+
Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
const std::vector<Output>& grad_inputs,
std::vector<Output>* grad_outputs) {
@@ -195,9 +261,9 @@ Status MaxPool3DGradHelper(const Scope& scope, const Operation& op,
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
MaxPool3DGrad::Attrs grad_attrs;
- auto dx = MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0],
- ksize, strides, padding,
- grad_attrs.DataFormat(data_format));
+ auto dx =
+ MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0], ksize,
+ strides, padding, grad_attrs.DataFormat(data_format));
grad_outputs->push_back(dx);
return scope.status();
}
@@ -216,10 +282,9 @@ Status AvgPoolGradHelper(const Scope& scope, const Operation& op,
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
internal::AvgPoolGrad::Attrs grad_attrs;
- auto dx =
- internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
- ksize, strides, padding,
- grad_attrs.DataFormat(data_format));
+ auto dx = internal::AvgPoolGrad(scope, Shape(scope, op.input(0)),
+ grad_inputs[0], ksize, strides, padding,
+ grad_attrs.DataFormat(data_format));
grad_outputs->push_back(dx);
return scope.status();
}
@@ -238,9 +303,9 @@ Status AvgPool3DGradHelper(const Scope& scope, const Operation& op,
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
AvgPool3DGrad::Attrs grad_attrs;
- auto dx = AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
- ksize, strides, padding,
- grad_attrs.DataFormat(data_format));
+ auto dx =
+ AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], ksize,
+ strides, padding, grad_attrs.DataFormat(data_format));
grad_outputs->push_back(dx);
return scope.status();
}
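In LaTeX, a sketch of what the new SoftmaxCrossEntropyWithLogitsGrad computes, assuming the usual definitions with s = softmax(logits), y the labels, and \odot the elementwise product (g_loss broadcasts over the class dimension; \langle\cdot,\cdot\rangle is the per-row inner product):

    % Incoming backprops: g_loss for the loss output, g_grad for the backprop
    % output op.output(1), which stores s - y.
    \frac{\partial L}{\partial \mathrm{logits}}
      = g_{\mathrm{loss}} \odot (s - y)
      + \bigl(g_{\mathrm{grad}} - \langle g_{\mathrm{grad}}, s\rangle\bigr) \odot s,
    \qquad
    \frac{\partial L}{\partial y} = -\,g_{\mathrm{loss}} \odot \log s

The second term in the logits gradient vanishes when g_grad is all zeros, which is exactly the case IsZero tries to detect so the extra Softmax/BatchMatMul work can be skipped.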
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index b4d457a9d1..aa72cf7ba2 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -25,6 +25,8 @@ limitations under the License.
namespace tensorflow {
namespace {
+using ops::AvgPool;
+using ops::AvgPool3D;
using ops::BiasAdd;
using ops::Conv2D;
using ops::Elu;
@@ -33,11 +35,9 @@ using ops::FractionalMaxPool;
using ops::L2Loss;
using ops::LogSoftmax;
using ops::LRN;
-using ops::AvgPool;
-using ops::AvgPool3D;
using ops::MaxPool;
-using ops::MaxPoolV2;
using ops::MaxPool3D;
+using ops::MaxPoolV2;
using ops::Placeholder;
using ops::Relu;
using ops::Relu6;
@@ -111,6 +111,20 @@ TEST_F(NNGradTest, SoftmaxGrad) {
RunTest(x, shape, y, shape);
}
+TEST_F(NNGradTest, SoftmaxCrossEntropyWithLogitsGrad) {
+ TensorShape logits_shape({5, 3});
+ TensorShape loss_shape({5});
+
+ auto logits = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(logits_shape));
+ auto labels = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(logits_shape));
+ auto y =
+ tensorflow::ops::SoftmaxCrossEntropyWithLogits(scope_, logits, labels);
+ // Note the reversal of the backprop and loss orders. Issue #18734 has been
+ // opened for this.
+ RunTest({logits, labels}, {logits_shape, logits_shape}, {y.backprop, y.loss},
+ {logits_shape, loss_shape});
+}
+
TEST_F(NNGradTest, LogSoftmaxGrad) {
TensorShape shape({5, 3});
auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
@@ -253,7 +267,7 @@ TEST_F(NNGradTest, AvgPool3DGradHelper) {
RunTest(x, x_shape, y, y_shape);
}
-TEST_F(NNGradTest, LRN){
+TEST_F(NNGradTest, LRN) {
TensorShape x_shape({1, 1, 2, 1});
auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
auto y = LRN(scope_, x);
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index 07807ed2f3..98be66a6ad 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -74,6 +74,54 @@ void AddAssetsTensorsToInputs(const StringPiece export_dir,
}
}
+// Like Session::Run(), but uses the Make/Run/ReleaseCallable() API to avoid
+// leaving behind non-GC'ed state.
+//
+// Detailed motivation behind this approach, from ashankar@:
+//
+// Each call to Session::Run() that identifies a new subgraph (based on feeds
+// and fetches) creates some data structures that live as long as the session
+// (the partitioned graph, the associated executors, etc.).
+//
+// A pathological case of this would be if, say, the initialization op
+// (main_op/legacy_init_op) involves a large constant: we would then allocate
+// memory for that constant, and it would stick around until the session dies.
+// With this Callable mechanism, that memory is released
+// right after ReleaseCallable returns.
+//
+// However, the resource manager state remains.
+Status RunOnce(const RunOptions& run_options,
+ const std::vector<std::pair<string, Tensor>>& inputs,
+ const std::vector<string>& output_tensor_names,
+ const std::vector<string>& target_node_names,
+ std::vector<Tensor>* outputs, RunMetadata* run_metadata,
+ Session* session) {
+ CallableOptions callable_options;
+ std::vector<Tensor> feed_tensors;
+ *callable_options.mutable_run_options() = run_options;
+ for (const auto& input : inputs) {
+ const string& name = input.first;
+ const Tensor& tensor = input.second;
+ callable_options.add_feed(name);
+ feed_tensors.push_back(tensor);
+ }
+ for (const string& output_tensor_name : output_tensor_names) {
+ callable_options.add_fetch(output_tensor_name);
+ }
+ for (const string& target_node_name : target_node_names) {
+ callable_options.add_target(target_node_name);
+ }
+
+ Session::CallableHandle callable_handle;
+ TF_RETURN_IF_ERROR(session->MakeCallable(callable_options, &callable_handle));
+ const Status run_status = session->RunCallable(callable_handle, feed_tensors,
+ outputs, run_metadata);
+ // Be sure to call ReleaseCallable() regardless of the outcome of
+ // RunCallable().
+ session->ReleaseCallable(callable_handle).IgnoreError();
+ return run_status;
+}
+
bool HasMainOp(const MetaGraphDef& meta_graph_def) {
const auto& collection_def_map = meta_graph_def.collection_def();
if (collection_def_map.find(kSavedModelMainOpKey) !=
@@ -86,10 +134,11 @@ bool HasMainOp(const MetaGraphDef& meta_graph_def) {
Status RunMainOp(const RunOptions& run_options, const string& export_dir,
const MetaGraphDef& meta_graph_def,
const std::vector<AssetFileDef>& asset_file_defs,
- Session* session) {
- LOG(INFO) << "Running MainOp on SavedModel bundle.";
+ Session* session, const string& main_op_key) {
+ LOG(INFO) << "Running MainOp with key " << main_op_key
+ << " on SavedModel bundle.";
const auto& collection_def_map = meta_graph_def.collection_def();
- const auto main_op_it = collection_def_map.find(kSavedModelMainOpKey);
+ const auto main_op_it = collection_def_map.find(main_op_key);
if (main_op_it != collection_def_map.end()) {
if (main_op_it->second.node_list().value_size() != 1) {
return errors::FailedPrecondition(
@@ -99,8 +148,8 @@ Status RunMainOp(const RunOptions& run_options, const string& export_dir,
AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
RunMetadata run_metadata;
const StringPiece main_op_name = main_op_it->second.node_list().value(0);
- return session->Run(run_options, inputs, {}, {main_op_name.ToString()},
- nullptr /* outputs */, &run_metadata);
+ return RunOnce(run_options, inputs, {}, {main_op_name.ToString()},
+ nullptr /* outputs */, &run_metadata, session);
}
return Status::OK();
}
@@ -137,32 +186,8 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir,
AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
RunMetadata run_metadata;
- return session->Run(run_options, inputs, {}, {restore_op_name.ToString()},
- nullptr /* outputs */, &run_metadata);
-}
-
-Status RunLegacyInitOp(const RunOptions& run_options, const string& export_dir,
- const MetaGraphDef& meta_graph_def,
- const std::vector<AssetFileDef>& asset_file_defs,
- Session* session) {
- LOG(INFO) << "Running LegacyInitOp on SavedModel bundle.";
- const auto& collection_def_map = meta_graph_def.collection_def();
- const auto init_op_it = collection_def_map.find(kSavedModelLegacyInitOpKey);
- if (init_op_it != collection_def_map.end()) {
- if (init_op_it->second.node_list().value_size() != 1) {
- return errors::FailedPrecondition(strings::StrCat(
- "Expected exactly one serving init op in : ", export_dir));
- }
- std::vector<std::pair<string, Tensor>> inputs;
- AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
- RunMetadata run_metadata;
- const StringPiece legacy_init_op_name =
- init_op_it->second.node_list().value(0);
- return session->Run(run_options, inputs, {},
- {legacy_init_op_name.ToString()}, nullptr /* outputs */,
- &run_metadata);
- }
- return Status::OK();
+ return RunOnce(run_options, inputs, {}, {restore_op_name.ToString()},
+ nullptr /* outputs */, &run_metadata, session);
}
Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def,
@@ -204,11 +229,11 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
if (HasMainOp(bundle->meta_graph_def)) {
TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir,
bundle->meta_graph_def, asset_file_defs,
- bundle->session.get()));
+ bundle->session.get(), kSavedModelMainOpKey));
} else {
- TF_RETURN_IF_ERROR(RunLegacyInitOp(run_options, export_dir,
- bundle->meta_graph_def, asset_file_defs,
- bundle->session.get()));
+ TF_RETURN_IF_ERROR(RunMainOp(
+ run_options, export_dir, bundle->meta_graph_def, asset_file_defs,
+ bundle->session.get(), kSavedModelLegacyInitOpKey));
}
return Status::OK();
}
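For reference, the Callable lifecycle that the new RunOnce helper wraps — a minimal sketch assuming a live Session* and a single fetch (the helper name is illustrative, not part of the patch):

    #include "tensorflow/core/public/session.h"

    using tensorflow::CallableOptions;
    using tensorflow::Session;
    using tensorflow::Status;
    using tensorflow::Tensor;

    Status RunFetchOnce(Session* session, const std::string& fetch, Tensor* out) {
      CallableOptions opts;
      opts.add_fetch(fetch);
      Session::CallableHandle handle;
      TF_RETURN_IF_ERROR(session->MakeCallable(opts, &handle));
      std::vector<Tensor> outputs;
      const Status run_status = session->RunCallable(handle, {}, &outputs, nullptr);
      // Release regardless of the run outcome so the per-subgraph state is freed.
      session->ReleaseCallable(handle).IgnoreError();
      if (run_status.ok() && !outputs.empty()) *out = outputs[0];
      return run_status;
    }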
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 2119c8ec47..fef8b8d4d4 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -68,6 +68,7 @@ cc_library(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:compile_only_client",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:compiler",
"//tensorflow/compiler/xla/service/cpu:cpu_compiler",
"//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index bbc35da2ef..2b5f97b34c 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -25,6 +25,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/compile_only_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/util.h"
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 9174a67cc6..e34347b9d4 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -166,6 +166,7 @@ cc_library(
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/service:stream_pool",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 73db0d5952..38eb6d830f 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -21,6 +21,7 @@ limitations under the License.
#include <unordered_map>
#include <unordered_set>
+#include "tensorflow/compiler/jit/deadness_analysis.h"
#include "tensorflow/compiler/jit/defs.h"
#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
@@ -463,6 +464,12 @@ Status MarkForCompilationPass::Run(
VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only;
const FunctionLibraryDefinition* fld = options.flib_def;
+ std::unique_ptr<DeadnessAnalysis> deadness;
+ {
+ XLA_SCOPED_LOGGING_TIMER_LEVEL("DeadnessAnalysis", 1);
+ TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(**options.graph, &deadness));
+ }
+
auto is_compilable = [&](const Node* node, const DeviceType& device_type) {
const XlaOpRegistry::DeviceRegistration* registration;
if (!XlaOpRegistry::GetCompilationDevice(device_type.type(),
@@ -470,15 +477,6 @@ Status MarkForCompilationPass::Run(
return false;
}
- // TODO(b/111570009): This bailout for ControlTrigger is probably not
- // needed.
- //
- // Don't compile control trigger nodes. We won't preserve their deadness
- // semantics correctly, so it's safest not to compile them.
- if (node->IsControlTrigger()) {
- return false;
- }
-
// If this device requires a JIT, we must say yes.
if (registration->requires_compilation) return true;
@@ -490,6 +488,14 @@ Status MarkForCompilationPass::Run(
status = fld->GetAttr(*node, kXlaCompileAttr, &compile);
if (status.ok()) return compile;
+ // If inputs to `node` can have conflicting deadness (i.e. some are alive
+ // and some are dead) then don't compile it. XLA cannot represent the
+ // deadness semantics of these nodes correctly and auto-clustering these
+ // nodes can cause deadness to propagate to nodes that should be live.
+ if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) {
+ return false;
+ }
+
// Check for fusable ops only if requested.
if (global_jit_level > 0 && fusion_only && !IsXlaFusable(node->def())) {
return false;
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 772c92d369..2c5f4fb774 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/cc/ops/array_ops.h"
#include "tensorflow/cc/ops/control_flow_ops_internal.h"
#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/sendrecv_ops.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/compiler/jit/defs.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -680,5 +681,37 @@ TEST(XlaCompilationTest, ClusterIdentityWithNonRefInput) {
EXPECT_EQ(clusters, expected_clusters);
}
+TEST(XlaCompilationTest, ClusterControlTrigger) {
+ Scope root = Scope::NewRootScope().ExitOnError();
+
+ Output recv_a = ops::_Recv(root.WithOpName("recv_a"), DT_BOOL, "tensor_a",
+ "sender", 0, "receiver");
+ Output recv_b = ops::_Recv(root.WithOpName("recv_b"), DT_BOOL, "tensor_b",
+ "sender", 0, "receiver");
+ Output const_a = ops::Const(root.WithOpName("const_a"), 42);
+
+ ops::ControlTrigger ctrl_trigger_a(root.WithOpName("ctrl_trigger_a"));
+ ops::ControlTrigger ctrl_trigger_b(root.WithOpName("ctrl_trigger_b"));
+ root.graph()->AddControlEdge(recv_a.node(), ctrl_trigger_a.operation.node());
+ root.graph()->AddControlEdge(recv_b.node(), ctrl_trigger_a.operation.node());
+ root.graph()->AddControlEdge(ctrl_trigger_b.operation.node(), const_a.node());
+
+ std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+ TF_ASSERT_OK(root.ToGraph(graph.get()));
+ TF_ASSERT_OK(MarkForCompilation(&graph));
+
+ std::unordered_map<string, string> clusters = GetClusters(*graph);
+
+ ASSERT_FALSE(clusters.empty());
+ string cluster_name = clusters.begin()->second;
+
+ // ctrl_trigger_a has inputs with mismatching deadness so it won't be
+ // clustered. ctrl_trigger_b is okay to cluster.
+ std::unordered_map<string, string> expected_clusters(
+ {{"const_a", cluster_name}, {"ctrl_trigger_b", cluster_name}});
+ EXPECT_EQ(clusters, expected_clusters);
+}
+
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 7ed609c437..54a41a4daa 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -40,7 +40,23 @@ namespace tensorflow {
XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client,
DeviceType device_type)
: client_(client), device_type_(std::move(device_type)) {}
-XlaCompilationCache::~XlaCompilationCache() = default;
+XlaCompilationCache::~XlaCompilationCache() {
+ // Ensure any use of our programs have completed by waiting for all stream
+ // executors to complete.
+ for (auto* executor : client_->backend().stream_executors()) {
+ bool ok = executor->SynchronizeAllActivity();
+ if (!ok) {
+ LOG(ERROR) << "Error synchronizing activity while waiting for all "
+ "programs to complete";
+ }
+ }
+ // TODO(b/110813685): Think about the program ownership model. Programs are
+ // currently owned by the compilation cache which means we must wait for
+ // program completion in the destructor. There are multiple compilation caches
+ // around, which complicates things a little. Perhaps having programs be
+ // shared_ptrs (an invasive change) would make the model easier to reason
+ // about?
+}
string XlaCompilationCache::DebugString() {
return "XLA JIT compilation cache";
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index fccdb14368..4a5942fbd7 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -29,6 +29,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/allocator.h"
@@ -153,17 +154,17 @@ class XlaDevice : public LocalDevice {
// stream are executed on the device. Operations include data
// copying back and forth between CPU and the device, and
// computations enqueued by XLA.
- xla::Backend::StreamPtr stream_;
+ xla::StreamPool::Ptr stream_;
// If true, only stream_ is valid and all computation and transfers use
// stream_. If false, computation is performed by stream_ and transfers are
// performed by host_to_device/device_to_host_stream.
bool use_multiple_streams_;
// If use_multiple_streams_, host to device transfers are performed using this
// stream.
- xla::Backend::StreamPtr host_to_device_stream_;
+ xla::StreamPool::Ptr host_to_device_stream_;
// If use_multiple_streams_, device to host transfers are performed using this
// stream.
- xla::Backend::StreamPtr device_to_host_stream_;
+ xla::StreamPool::Ptr device_to_host_stream_;
// Must we use XLA's transfer manager for correct host<->device transfers? if
// false, we can use ThenMemcpy() instead.
bool transfer_as_literal_;
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 04778c0090..8cf198239c 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -74,43 +74,64 @@ Status XlaTransferManager::TransferLiteralToDevice(
xla::Shape xla_shape;
TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(),
host_tensor.shape(), &xla_shape));
- xla::BorrowingLiteral literal(
+ // Create a reference to hold onto host_tensor until after the literal has
+ // been transferred. Also make sure the literal exists until the function
+ // asynchronously completes, as it will be wrapped in an xla::LiteralSlice.
+ TensorReference ref(host_tensor);
+ auto literal = std::make_shared<xla::BorrowingLiteral>(
static_cast<const char*>(DMAHelper::base(&host_tensor)), xla_shape);
XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer();
- VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " "
+ VLOG(1) << "Transfer to device as literal: " << literal->ToString() << " "
<< shaped_buffer.ToString();
- TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDevice(
- host_to_device_stream_, literal, shaped_buffer));
+ if (UseMultipleStreams()) {
+ // Initially wait for the compute stream so that memory allocations are
+ // synchronized.
+ host_to_device_stream_->ThenWaitFor(stream_);
+ }
+ TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
+ host_to_device_stream_, *literal, shaped_buffer));
if (UseMultipleStreams()) {
se::Event event(stream_->parent());
TF_RET_CHECK(event.Init()) << "Event failed to initialize!";
host_to_device_stream_->ThenRecordEvent(&event);
xla_tensor->SetDefinedOn(host_to_device_stream_, std::move(event));
}
+ // Unref the host tensor, and capture the literal shared_ptr too so that the
+ // literal stays alive until the callback has run.
+ host_to_device_stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
return Status::OK();
}
-Status XlaTransferManager::TransferLiteralFromDevice(
- Tensor* host_tensor, const Tensor& device_tensor) const {
+void XlaTransferManager::TransferLiteralFromDevice(
+ Tensor* host_tensor, const Tensor& device_tensor,
+ const StatusCallback& done) const {
const xla::ShapedBuffer& shaped_buffer =
XlaTensor::FromTensor(&device_tensor)->shaped_buffer();
- TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Literal> literal,
- transfer_manager_->TransferLiteralFromDevice(
- device_to_host_stream_, shaped_buffer));
- VLOG(1) << "Transfer from device as literal: " << literal->ToString() << " "
- << shaped_buffer.ToString();
- Tensor tensor;
- TF_RETURN_IF_ERROR(
- LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor));
- // Reshape the tensor back to its declared shape.
- if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) {
- return errors::Internal(
- "Tensor::CopyFrom failed when copying from XLA device to CPU");
- }
- return Status::OK();
+ TensorReference ref(device_tensor);
+ transfer_manager_->TransferLiteralFromDevice(
+ device_to_host_stream_, shaped_buffer,
+ [=, &shaped_buffer](
+ xla::StatusOr<std::unique_ptr<xla::Literal> > literal_or) {
+ ref.Unref();
+ done([&]() -> Status {
+ TF_ASSIGN_OR_RETURN(auto literal, std::move(literal_or));
+ VLOG(1) << "Transfer from device as literal: " << literal->ToString()
+ << " " << shaped_buffer.ToString();
+ Tensor tensor;
+ TF_RETURN_IF_ERROR(
+ LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor));
+ // Reshape the tensor back to its declared shape.
+ Status status;
+ if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) {
+ status = errors::Internal(
+ "Tensor::CopyFrom failed when copying from XLA device to CPU");
+ }
+ return status;
+ }());
+ });
}
void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
@@ -163,6 +184,12 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
return;
}
status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
+ if (status.ok()) {
+ xla_tensor->set_host_tensor(*cpu_tensor);
+ host_to_device_stream_->ThenDoHostCallback(
+ [done]() { done(Status::OK()); });
+ return;
+ }
} else {
se::DeviceMemoryBase dev_dst_ptr =
XlaTensor::DeviceMemoryFromTensor(*device_tensor);
@@ -212,7 +239,8 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
Status status;
if (transfer_as_literal_) {
- status = TransferLiteralFromDevice(cpu_tensor, *device_tensor);
+ TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
+ return;
} else {
device_to_host_stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
// TODO(hpucha): Make this asynchronous.
@@ -234,15 +262,15 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
<< reinterpret_cast<const void*>(src_tensor.tensor_data().data())
<< " "
<< reinterpret_cast<const void*>(dst_tensor->tensor_data().data());
- // TODO(phawkins): replace this code with an asynchronous implementation.
- auto body = [&]() {
+ // Perform memory allocation now, and enqueue the device-to-device transfer.
+ Status status = [&]() -> Status {
if (src_tensor.NumElements() == 0) {
return Status::OK();
}
// TODO(jmolloy): We co-opt the device_to_host stream for device to device
// transfers; perhaps we should have a dedicated device to device stream? or
// one per device?
- auto device_to_device_stream = device_to_host_stream_;
+ auto device_to_device_stream = stream_;
XlaTensor* xla_src = XlaTensor::FromTensor(&src_tensor);
XlaTensor* xla_dst = XlaTensor::FromTensor(dst_tensor);
CHECK(xla_src && xla_dst)
@@ -254,29 +282,40 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
TF_RETURN_IF_ERROR(
xla_dst->AllocateShapedBuffer(src_tensor.dtype(), shape, client_,
stream_->parent()->device_ordinal()));
+ if (stream_ != device_to_device_stream) {
+ // Initially wait for the compute stream so that memory allocations are
+ // synchronized.
+ device_to_device_stream->ThenWaitFor(stream_);
+ }
}
if (se::Event* event =
xla_src->GetDefinitionEvent(device_to_device_stream)) {
device_to_device_stream->ThenWaitFor(event);
xla_src->SetDefinedOn(device_to_device_stream);
- TF_RETURN_IF_ERROR(device_to_device_stream->BlockHostUntilDone());
}
- TF_RETURN_IF_ERROR(
- xla_dst->shaped_buffer().buffers().ForEachMutableElementWithStatus(
- [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
- const se::DeviceMemoryBase& from_buffer =
- xla_src->shaped_buffer().buffers().element(index);
- CHECK_EQ(buffer->size(), from_buffer.size());
- if (!stream_->parent()->SynchronousMemcpy(buffer, from_buffer,
- buffer->size())) {
- return errors::Internal("Device to device memcpy failed");
- }
- return Status::OK();
- }));
+
+ auto from_iter = xla_src->shaped_buffer().buffers().begin();
+ auto to_iter = xla_dst->shaped_buffer().buffers().begin();
+ for (auto end_iter = xla_src->shaped_buffer().buffers().end();
+ from_iter != end_iter; ++from_iter, ++to_iter) {
+ device_to_device_stream->ThenMemcpyD2D(
+ &to_iter->second, from_iter->second, to_iter->second.size());
+ }
+
+ if (UseMultipleStreams()) {
+ se::Event event(stream_->parent());
+ CHECK(event.Init());
+ device_to_device_stream->ThenRecordEvent(&event);
+ xla_dst->SetDefinedOn(device_to_device_stream, std::move(event));
+ }
return Status::OK();
- };
- done(body());
+ }();
+ if (!status.ok()) {
+ return done(status);
+ } else {
+ stream_->ThenDoHostCallback([=]() { done(Status::OK()); });
+ }
}
XlaDeviceContext::XlaDeviceContext(
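The recurring idiom in the hunks above: take a TensorReference to pin a tensor's buffer, enqueue asynchronous device work, then unref from a host callback that the stream runs once that work has drained. A condensed sketch of the keep-alive pattern, using only calls that appear in this file:

    #include <functional>

    #include "tensorflow/core/framework/tensor.h"
    #include "tensorflow/core/framework/tensor_reference.h"

    namespace se = ::stream_executor;

    // Sketch: keep `tensor` alive until the work enqueued on `stream` by
    // `enqueue_work` has completed on the device.
    void EnqueueWithKeepAlive(se::Stream* stream,
                              const tensorflow::Tensor& tensor,
                              const std::function<void()>& enqueue_work) {
      tensorflow::TensorReference ref(tensor);  // pins the underlying buffer
      enqueue_work();  // e.g. an asynchronous literal transfer
      // Runs on the host after everything enqueued so far has completed, so
      // the buffer cannot be freed while the device may still touch it.
      stream->ThenDoHostCallback([ref]() { ref.Unref(); });
    }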
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index c726495f96..912f8d779e 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -66,8 +66,9 @@ class XlaTransferManager {
private:
Status TransferLiteralToDevice(const Tensor& host_tensor,
Tensor* device_tensor) const;
- Status TransferLiteralFromDevice(Tensor* host_tensor,
- const Tensor& device_tensor) const;
+ void TransferLiteralFromDevice(Tensor* host_tensor,
+ const Tensor& device_tensor,
+ const StatusCallback& done) const;
bool UseMultipleStreams() const { return stream_ != host_to_device_stream_; }
// The main compute stream of the device, used to synchronize the transfer
diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.cc b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
index 74257b09a8..4b499b1613 100644
--- a/tensorflow/compiler/jit/xla_fusion_optimizer.cc
+++ b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
@@ -20,6 +20,7 @@ limitations under the License.
#include <unordered_map>
#include <unordered_set>
+#include "tensorflow/compiler/jit/deadness_analysis.h"
#include "tensorflow/compiler/jit/defs.h"
#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
#include "tensorflow/compiler/jit/union_find.h"
@@ -146,6 +147,9 @@ Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster,
TF_RETURN_IF_ERROR(
ImportGraphDef(options, item.graph, &graph, &shape_refiner));
+ std::unique_ptr<DeadnessAnalysis> deadness;
+ TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(graph, &deadness));
+
// Collect nodes that can be fused via XLA, while ignoring those that
// explicitly ask for XLA: (*) nodes that are marked to be compiled
// explicitly. (*) nodes assigned to XLA device.
@@ -185,6 +189,14 @@ Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster,
continue;
}
+ // If inputs to `node` can have conflicting deadness (i.e. some are alive
+ // and some are dead) then don't compile it. XLA cannot represent the
+ // deadness semantics of these nodes correctly and auto-clustering these
+ // nodes can cause deadness to propagate to nodes that should be live.
+ if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) {
+ continue;
+ }
+
compilation_candidates.insert(node);
}
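Background for the guard added above: under TensorFlow control flow, a Switch makes one of its outputs "dead", and a node whose inputs mix live and dead values has semantics XLA cannot express; clustering such a node could make deadness leak into nodes that should stay live. A self-contained sketch of the same predicate, built from the calls used in the hunk:

    #include <memory>

    #include "tensorflow/compiler/jit/deadness_analysis.h"
    #include "tensorflow/core/graph/graph.h"
    #include "tensorflow/core/lib/core/errors.h"

    // Sketch: is `node` safe to auto-cluster with respect to deadness?
    tensorflow::Status IsClusterableWrtDeadness(
        const tensorflow::Graph& graph, const tensorflow::Node* node,
        bool* clusterable) {
      std::unique_ptr<tensorflow::DeadnessAnalysis> deadness;
      TF_RETURN_IF_ERROR(tensorflow::DeadnessAnalysis::Run(graph, &deadness));
      // Merge nodes and nodes whose inputs may disagree on liveness stay out.
      *clusterable = !node->IsMerge() &&
                     !deadness->HasInputsWithMismatchingDeadness(*node);
      return tensorflow::Status::OK();
    }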
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 616c3ed2a2..6134b8c694 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -64,11 +64,13 @@ xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
int device_ordinal, uint64 size, bool retry_on_failure) {
AllocationAttributes attrs;
attrs.no_retry_on_failure = !retry_on_failure;
- void* data =
- wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs);
- if (data == nullptr) {
- return errors::ResourceExhausted("Out of memory while trying to allocate ",
- size, " bytes.");
+ void* data = nullptr;
+ if (size != 0) {
+ data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs);
+ if (data == nullptr) {
+ return errors::ResourceExhausted(
+ "Out of memory while trying to allocate ", size, " bytes.");
+ }
}
return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
device_ordinal, this);
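The allocator change above lets zero-byte allocations succeed as a (nullptr, 0) region instead of being forwarded to the wrapped allocator, which may refuse them. A minimal sketch of the short-circuit, with a hypothetical RawAllocator standing in for the wrapped tensorflow::Allocator:

    #include <cstdint>
    #include <cstdlib>

    // Hypothetical stand-in for the wrapped allocator.
    struct RawAllocator {
      void* AllocateRaw(std::size_t alignment, std::uint64_t size) {
        return std::malloc(size);  // alignment handling elided in this sketch
      }
    };

    // Sketch: a zero-byte request never touches the wrapped allocator and is
    // never reported as out-of-memory, matching XlaAllocator::Allocate above.
    bool Allocate(RawAllocator* wrapped, std::uint64_t size, void** out) {
      *out = nullptr;
      if (size == 0) return true;  // empty buffers need no backing memory
      *out = wrapped->AllocateRaw(64, size);
      return *out != nullptr;  // false => genuine allocation failure
    }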
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index 5dff187fff..d777dfa5a3 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -92,10 +92,8 @@ se::Event* XlaTensor::GetDefinitionEvent(se::Stream* stream) {
void XlaTensor::SetDefinedOn(se::Stream* stream, se::Event event) {
mutex_lock lock(mu_);
- CHECK(!definition_event_.has_value())
- << "SetDefinedOn must only be called once!";
definition_event_ = std::move(event);
- streams_defined_on_.push_back(stream);
+ streams_defined_on_ = {stream};
}
void XlaTensor::SetDefinedOn(se::Stream* stream) {
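SetDefinedOn is the producer half of the cross-stream handshake used throughout this patch: the stream that produces a buffer records an event, the tensor stores it, and any consuming stream waits on the event before touching the buffer. A condensed sketch using only the Event and Stream calls that appear in these hunks, assuming the usual stream_executor platform header; the caller must keep the event alive (as XlaTensor does in definition_event_) until the wait has been consumed:

    #include "tensorflow/core/platform/stream_executor.h"

    namespace se = ::stream_executor;

    // Sketch: make work later enqueued on `consumer` wait for everything
    // already enqueued on `producer`, without blocking the host.
    bool OrderStreams(se::Stream* producer, se::Stream* consumer,
                      se::Event* event) {
      if (!event->Init()) return false;   // event could not be initialized
      producer->ThenRecordEvent(event);   // fires when prior work finishes
      consumer->ThenWaitFor(event);       // consumer work waits for the event
      return true;
    }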
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index ff002d15b0..338943201b 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -81,7 +81,7 @@ cc_library(
"//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
"//tensorflow/compiler/tf2xla/kernels:xla_ops",
"//tensorflow/compiler/xla/client",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
@@ -119,6 +119,7 @@ cc_library(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:cpu_plugin",
"//tensorflow/compiler/xla/service/cpu:cpu_executable",
"//tensorflow/core:lib",
@@ -139,14 +140,12 @@ cc_library(
"xla_op_registry.cc",
"xla_resource.cc",
"xla_cpu_backend.cc",
- "legacy_flags/backend_registration_flags.cc",
] + if_cuda_is_configured([
"xla_gpu_backend.cc",
]),
hdrs = [
"const_analysis.h",
"graph_compiler.h",
- "legacy_flags/backend_registration_flags.h",
"xla_compilation_device.h",
"xla_compiler.h",
"xla_context.h",
@@ -172,16 +171,14 @@ cc_library(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
"//tensorflow/compiler/xla/client/lib:constants",
"//tensorflow/compiler/xla/client/lib:numeric",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
- "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
"//tensorflow/core:core_cpu",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
- "//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:protos_all_cc",
@@ -294,6 +291,7 @@ tf_cc_test(
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:cpu_plugin",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
index 03603ee9ba..24616c01c7 100644
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ b/tensorflow/compiler/tf2xla/dump_graph.cc
@@ -33,7 +33,7 @@ struct NameCounts {
std::unordered_map<string, int> counts;
};
-string MakeUniquePath(string name) {
+string MakeUniqueFilename(string name) {
static NameCounts& instance = *new NameCounts;
// Remove illegal characters from `name`.
@@ -50,26 +50,41 @@ string MakeUniquePath(string name) {
count = instance.counts[name]++;
}
- legacy_flags::DumpGraphFlags* flags = legacy_flags::GetDumpGraphFlags();
- string path = strings::StrCat(flags->tf_dump_graph_prefix, "/", name);
+ string filename = name;
if (count > 0) {
- strings::StrAppend(&path, "_", count);
+ strings::StrAppend(&filename, "_", count);
}
- strings::StrAppend(&path, ".pbtxt");
- return path;
+ strings::StrAppend(&filename, ".pbtxt");
+ return filename;
+}
+
+string WriteTextProtoToUniqueFile(
+ Env* env, const string& name, const char* proto_type,
+ const ::tensorflow::protobuf::Message& proto) {
+ const string& dirname =
+ legacy_flags::GetDumpGraphFlags()->tf_dump_graph_prefix;
+ Status status = env->RecursivelyCreateDir(dirname);
+ if (!status.ok()) {
+ LOG(WARNING) << "Failed to create " << dirname << " for dumping "
+ << proto_type << ": " << status;
+ return "(unavailable)";
+ }
+ string filepath = strings::StrCat(dirname, "/", MakeUniqueFilename(name));
+ status = WriteTextProto(env, filepath, proto);
+ if (!status.ok()) {
+ LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath
+ << " : " << status;
+ return "(unavailable)";
+ }
+ LOG(INFO) << "Dumped " << proto_type << " to " << filepath;
+ return filepath;
}
} // anonymous namespace
string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
- string path = MakeUniquePath(name);
- Status status = WriteTextProto(Env::Default(), path, graph_def);
- if (!status.ok()) {
- VLOG(1) << "Failed to dump GraphDef to file: " << path << " : " << status;
- path.clear();
- path = "(unavailable)";
- }
- return path;
+ return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef",
+ graph_def);
}
string DumpGraphToFile(const string& name, Graph const& graph,
@@ -83,15 +98,7 @@ string DumpGraphToFile(const string& name, Graph const& graph,
}
string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
- string path = MakeUniquePath(name);
- Status status = WriteTextProto(Env::Default(), path, fdef);
- if (!status.ok()) {
- VLOG(1) << "Failed to dump FunctionDef to file: " << path << " : "
- << status;
- path.clear();
- path = "(unavailable)";
- }
- return path;
+ return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef);
}
} // namespace dump_graph
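After this refactoring, all three dump entry points funnel through one helper that creates the dump directory on demand and never fails the caller: on error it logs and returns the sentinel "(unavailable)". A usage sketch, assuming the tf_dump_graph_prefix flag points at a writable directory:

    #include "tensorflow/compiler/tf2xla/dump_graph.h"
    #include "tensorflow/core/framework/graph.pb.h"
    #include "tensorflow/core/platform/logging.h"

    // Sketch: dump a GraphDef before and after some rewrite. The returned
    // path is "(unavailable)" if the dump could not be written.
    void DumpAroundRewrite(const tensorflow::GraphDef& before,
                           const tensorflow::GraphDef& after) {
      LOG(INFO) << "before: "
                << tensorflow::dump_graph::DumpGraphDefToFile(
                       "rewrite_before", before);
      LOG(INFO) << "after: "
                << tensorflow::dump_graph::DumpGraphDefToFile(
                       "rewrite_after", after);
    }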
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 6cc95149a1..0904778f97 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -177,8 +177,8 @@ Status CheckNoCycleContains(const Node* node, const int num_nodes) {
visited[current_node->id()] = true;
for (const Edge* out : current_node->out_edges()) {
if (out->dst() == node) {
- return errors::Internal("Detect a cycle: Node \"", node->name(), "\"(",
- node->def().op(), ") feeds into itself.");
+ return errors::Internal("Detected a cycle: ", FormatNodeForError(*node),
+ "(", node->def().op(), ") feeds into itself.");
} else if (!visited[out->dst()->id()]) {
ready.push_back(out->dst());
}
@@ -324,7 +324,7 @@ Status AddMissingFunctionDef(const FunctionDef& fdef,
if (library->Find(node.op())) {
continue;
}
- // The function refered by 'SymbolicGradient' node is specified in its
+ // The function referred to by a 'SymbolicGradient' node is specified in its
// attribute 'f'.
if (node.op() == FunctionLibraryDefinition::kGradientOp) {
const AttrValue* attr =
@@ -437,22 +437,24 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
continue;
}
if (enter_merge != nullptr) {
- return errors::Internal(
- "Enter node for loop-varying argument ", arg.enter->name(),
- " has multiple successors: ", enter_merge->dst()->name(), " and ",
- e->dst()->name());
+ return errors::Internal("Enter node for loop-varying argument ",
+ FormatNodeForError(*arg.enter),
+ " has multiple successors: ",
+ FormatNodeForError(*enter_merge->dst()),
+ " and ", FormatNodeForError(*e->dst()));
}
enter_merge = e;
}
if (enter_merge == nullptr) {
return errors::Internal("Enter node for loop-varying argument ",
- arg.enter->name(), " has zero successors");
+ FormatNodeForError(*arg.enter),
+ " has zero successors");
}
arg.merge = enter_merge->dst();
if (!IsMerge(arg.merge)) {
return errors::InvalidArgument(
"Successor of Enter node for loop-varying argument ",
- arg.merge->name(),
+ FormatNodeForError(*arg.merge),
" is not a Merge node; got: ", arg.merge->type_string());
}
@@ -462,7 +464,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
return errors::InvalidArgument(
"Unexpected number of inputs to Merge node for loop-varying "
"argument ",
- arg.merge->name(), "; expected 2, got ",
+ FormatNodeForError(*arg.merge), "; expected 2, got ",
arg.merge->input_types().size());
}
TF_RETURN_IF_ERROR(arg.merge->input_node(1 - enter_merge->dst_input(),
@@ -470,7 +472,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
if (!IsNextIteration(arg.next_iteration)) {
return errors::InvalidArgument(
"Expected NextIteration node as input to Merge node; got node ",
- arg.next_iteration->name(), " with kind ",
+ FormatNodeForError(*arg.next_iteration), " with kind ",
arg.next_iteration->type_string());
}
@@ -481,14 +483,14 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
switches.find(edge->dst()) != switches.end()) {
if (arg.switch_node != nullptr) {
return errors::InvalidArgument("Duplicate Switch successors to ",
- arg.merge->name());
+ FormatNodeForError(*arg.merge));
}
arg.switch_node = edge->dst();
}
}
if (arg.switch_node == nullptr) {
return errors::InvalidArgument("Missing Switch successor to ",
- arg.merge->name());
+ FormatNodeForError(*arg.merge));
}
// Update the device on the Identity outputs of the switch to match their
@@ -516,14 +518,15 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
possible_exit.pop_front();
if (IsExit(edge->dst())) {
if (arg.exit != nullptr) {
- return errors::InvalidArgument("Duplicate Exit successors to ",
- arg.switch_node->name());
+ return errors::InvalidArgument(
+ "Duplicate Exit successors to ",
+ FormatNodeForError(*arg.switch_node));
}
arg.exit = edge->dst();
} else {
if (!IsIdentity(edge->dst())) {
return errors::Unimplemented("General graph between switch (",
- arg.switch_node->name(),
+ FormatNodeForError(*arg.switch_node),
") and exit node of frame ",
frame->name, " not supported yet.");
}
@@ -1470,7 +1473,7 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
if (!unreachable_nodes.empty()) {
return errors::InvalidArgument(
"The following nodes are unreachable from the source in the graph: ",
- tensorflow::str_util::Join(unreachable_nodes, ", "));
+ errors::FormatNodeNamesForError(unreachable_nodes));
}
// Builds Frames, indexed by name.
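The error-message changes in this file all replace a bare node->name() with FormatNodeForError(*node), which wraps the name in a "{{node ...}}" marker that downstream error rewriting (and the updated test below) can find; FormatNodeNamesForError does the same for a list of names. A sketch of the resulting message shape, restating the cycle error above in isolation and assuming FormatNodeForError is visible via the NodeDef utility header:

    #include "tensorflow/core/framework/node_def_util.h"
    #include "tensorflow/core/graph/graph.h"
    #include "tensorflow/core/lib/core/errors.h"

    // Sketch: for a node named "cond/Less_5_If" this yields an error whose
    // text contains "{{node cond/Less_5_If}}", as the test below asserts.
    tensorflow::Status CycleError(const tensorflow::Node& node) {
      return tensorflow::errors::Internal(
          "Detected a cycle: ", tensorflow::FormatNodeForError(node), "(",
          node.def().op(), ") feeds into itself.");
    }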
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index aae2f8ee5a..ccf249b35d 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -1064,7 +1064,10 @@ TEST(FunctionalizeControlFlow, Cycle) {
// less -> XlaIf <--> identity.
Status status = FunctionalizeControlFlow(graph.get(), &library);
EXPECT_FALSE(status.ok());
- EXPECT_TRUE(str_util::StrContains(status.error_message(), "Detect a cycle"))
+ EXPECT_TRUE(str_util::StrContains(status.error_message(), "Detected a cycle"))
+ << status.error_message();
+ EXPECT_TRUE(
+ str_util::StrContains(status.error_message(), "{{node cond/Less_5_If}}"))
<< status.error_message();
}
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index e1cea03865..e4fdf0a618 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -29,7 +29,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_context.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/executor.h"
#include "tensorflow/core/common_runtime/function.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index d88a34dfd9..f96483d23d 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -123,12 +123,13 @@ tf_kernel_library(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
"//tensorflow/compiler/xla/client/lib:constants",
"//tensorflow/compiler/xla/client/lib:math",
"//tensorflow/compiler/xla/client/lib:numeric",
"//tensorflow/compiler/xla/client/lib:prng",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/core:framework",
"//tensorflow/core:image_ops_op_lib",
"//tensorflow/core:lib",
@@ -164,7 +165,8 @@ tf_kernel_library(
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/tf2xla/ops:xla_ops",
"//tensorflow/compiler/xla:literal",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
@@ -180,7 +182,7 @@ tf_kernel_library(
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/tf2xla/ops:xla_ops",
"//tensorflow/compiler/xla:literal",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
@@ -217,8 +219,8 @@ tf_kernel_library(
"//tensorflow/compiler/xla:literal",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla/client:client_library",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core/kernels:argmax_op",
diff --git a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
index e335328280..41a453da80 100644
--- a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index c4af79281d..b3ad0aea84 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index 26130fd9e7..48f2a005ab 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
index e9b2c0b16d..41f540506b 100644
--- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index d6d4ae8937..2c328102e0 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
index efbdb76eaa..5078f8662b 100644
--- a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index 62eebf762b..8cc2479dd5 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index 1784e712b5..e7fef77edc 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
index 4e6d33304c..547fe48046 100644
--- a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/tensor_shape.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index e3a32a5c0e..f410605104 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index f4360d8c3f..da8cf3fc6f 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/tensor.pb.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 48ac4867ed..5da7972397 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/cross_op.cc b/tensorflow/compiler/tf2xla/kernels/cross_op.cc
index 500a564f3f..db579a5b35 100644
--- a/tensorflow/compiler/tf2xla/kernels/cross_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cross_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
index 9ff3e02228..ef1015552d 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/util/bcast.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
index 4f92dbc874..a5b870f8db 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/bcast.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
index f314920025..12b0e38288 100644
--- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index 22cda27567..ed44ad218b 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
index 3b86ea34c9..a3389d5b90 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/shape_util.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/compiler/tf2xla/type_util.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index 958231505b..cb73053666 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/elu_op.cc b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
index 81f42e504e..5fdb1d972c 100644
--- a/tensorflow/compiler/tf2xla/kernels/elu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
index 65d42a302f..c68b0bfd79 100644
--- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
index 2fd1a34741..cdba6680de 100644
--- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/platform/macros.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
index b2b00e51e3..80bcef9663 100644
--- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
index 95faa1d058..54b21a2782 100644
--- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 5f041be5df..35de96e0aa 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
index d898e43b85..92346283c3 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/bcast.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index e2160feba0..ceb2af756c 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_context.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index cb4caf7bcb..6e061ba278 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index d6bf92fb3d..8d75624e74 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/lib/math/math_util.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
index 9e64711051..f028e361bc 100644
--- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/no_op.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
index 2fb072f827..a11bbe918f 100644
--- a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
index dc934543cb..87ee2d3aed 100644
--- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
index 844080b8cf..6440770c29 100644
--- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
namespace tensorflow {
@@ -54,10 +54,14 @@ class MatMulOp : public XlaOpKernel {
const TensorShape b_shape = ctx->InputShape(1);
// Check that the dimensions of the two matrices are valid.
- OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_shape),
- errors::InvalidArgument("In[0] is not a matrix"));
- OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b_shape),
- errors::InvalidArgument("In[1] is not a matrix"));
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsMatrix(a_shape),
+ errors::InvalidArgument("In[0] is not a matrix. Instead it has shape ",
+ a_shape.DebugString()));
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsMatrix(b_shape),
+ errors::InvalidArgument("In[1] is not a matrix. Instead it has shape ",
+ b_shape.DebugString()));
int first_index = transpose_a_ ? 0 : 1;
int second_index = transpose_b_ ? 1 : 0;
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
index e06c87db7a..8dfd7de591 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/tensor_shape.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
index e2ab4b83cf..c0ca881ff8 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
index 529959dbd9..eedfc3c914 100644
--- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/util/mirror_pad_mode.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/pack_op.cc b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
index 3aed47de26..a9b519d892 100644
--- a/tensorflow/compiler/tf2xla/kernels/pack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
index 89fd610bc6..e5937b56c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 12d9cb9bac..3d506e71e0 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -21,7 +21,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
index e88221e4f4..6f4ed496a1 100644
--- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
@@ -19,7 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/core/platform/macros.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 607cad798a..2da9340625 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -27,7 +27,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
index 76bd1e62aa..b11a4ce36d 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
@@ -19,7 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
index be7f2bce8c..0d260fa8fc 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
index 8333f9b288..466e79828d 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
@@ -19,7 +19,7 @@ limitations under the License.
#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_REDUCTION_OPS_H_
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index ed1d1c6610..b52f0a0ab6 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -19,7 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/type_util.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index f4b804e546..d35777ccb1 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
index 354fec9be7..121750a82a 100644
--- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index 5be70a4ded..1911e6ea36 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_context.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
index ec15b4cc7a..d962ef4a5f 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
index c810456f94..03a50ef8a0 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/tensor_shape.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index 27ab3e1bf5..ab094d7dd1 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -20,7 +20,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/partial_tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
index 14709bb6cb..f1f32699fe 100644
--- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
index e2ac7da2c2..b22ecb7c6d 100644
--- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc
index 5c010c9df2..6ce50efb4a 100644
--- a/tensorflow/compiler/tf2xla/kernels/select_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/kernels/bounds_check.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
index 6281d6c653..a7f5a8f169 100644
--- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 5798823cd5..4e0cf99d8e 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/kernels/bounds_check.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
index 1864584ade..6adc3c58de 100644
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
index a71fbcd901..1d7a63dc31 100644
--- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
@@ -20,7 +20,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
index faaf8964ff..aaeeae01cc 100644
--- a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
index 8a8525efa1..7327258c31 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
index 47d282fe9e..4493539fe3 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc
index 242638f981..93fc14e9ef 100644
--- a/tensorflow/compiler/tf2xla/kernels/split_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index cc4b13d3b9..5412e13547 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -24,7 +24,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/math.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
#include "tensorflow/compiler/xla/client/lib/prng.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index c2165ccd86..1062399d91 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 26326f18b8..be1814d8e3 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -25,7 +25,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/tf2xla/xla_resource.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/partial_tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
index c9e5694262..1233a37565 100644
--- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
index 82d4a69777..e73fb283b0 100644
--- a/tensorflow/compiler/tf2xla/kernels/topk_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 98df730249..be5e911386 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
index 6c721c48fe..f9148b3942 100644
--- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
@@ -23,7 +23,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/bounds_check.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index e6ec794cfd..0bdfc05726 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -23,7 +23,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
index f951127bb9..8671632976 100644
--- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index bb27b5d56f..2c92a585f5 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/shape_util.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index 009fdd81b2..1e8a376765 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -21,7 +21,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.cc b/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.cc
deleted file mode 100644
index 661505021f..0000000000
--- a/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's backend registration modules.
-
-#include <mutex> // NOLINT
-#include <vector>
-
-#include "tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static BackendRegistrationFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags. Called via call_once(&flags_init,...).
-static void AllocateFlags() {
- flags = new BackendRegistrationFlags;
- flags->tf_enable_prng_ops_gpu = false;
- flag_list = new std::vector<Flag>({
- Flag("tf_enable_prng_ops_gpu", &flags->tf_enable_prng_ops_gpu,
- "Whether to enable PRNG ops: [RandomStandardNormal | RandomUniform "
- "| RandomUniformInt | TruncatedNormal] on GPU."),
- });
- xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with the XLA bridge's
-// backend registration modules.
-void AppendBackendRegistrationFlags(std::vector<Flag>* append_to) {
- std::call_once(flags_init, &AllocateFlags);
- append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the BackendRegistrationFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-BackendRegistrationFlags* GetBackendRegistrationFlags() {
- std::call_once(flags_init, &AllocateFlags);
- return flags;
-}
-
-} // namespace legacy_flags
-} // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h b/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h
deleted file mode 100644
index 861c923dd5..0000000000
--- a/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LEGACY_FLAGS_BACKEND_REGISTRATION_FLAGS_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LEGACY_FLAGS_BACKEND_REGISTRATION_FLAGS_H_
-
-// Legacy flags for the XLA bridge's backend registration modules.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with the XLA bridge's
-// backend registration modules.
-void AppendBackendRegistrationFlags(std::vector<tensorflow::Flag>* append_to);
-
-// The values of flags associated with the XLA bridge's backend registration
-// module.
-typedef struct {
- // Whether to enable RandomUniform op on GPU backend.
- // TODO (b/32333178): Remove this flag or set its default to true.
- bool tf_enable_prng_ops_gpu;
-} BackendRegistrationFlags;
-
-// Return a pointer to the BackendRegistrationFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-BackendRegistrationFlags* GetBackendRegistrationFlags();
-
-} // namespace legacy_flags
-} // namespace tensorflow
-
-#endif // TENSORFLOW_COMPILER_TF2XLA_LEGACY_FLAGS_BACKEND_REGISTRATION_FLAGS_H_
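[Editorial note] The two deleted files above remove the tf_enable_prng_ops_gpu escape hatch together with its accessor API. For context, a caller would have consulted the flag roughly as follows; this call site is a hypothetical sketch built only from the deleted declarations, not code from this patch:

// Hypothetical call site illustrating the removed legacy-flags API.
#include "tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h"

static bool PrngOpsEnabledOnGpu() {
  // GetBackendRegistrationFlags() lazily parses the flag from the
  // environment on first use and returns the same singleton thereafter.
  tensorflow::legacy_flags::BackendRegistrationFlags* flags =
      tensorflow::legacy_flags::GetBackendRegistrationFlags();
  return flags->tf_enable_prng_ops_gpu;
}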
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 30039e256a..cb7a40e23d 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -25,8 +25,8 @@ cc_library(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/core:lib",
],
)
@@ -44,9 +44,9 @@ cc_library(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:constants",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/core:lib",
],
)
@@ -59,9 +59,9 @@ cc_library(
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/client/lib:constants",
"//tensorflow/compiler/xla/client/lib:math",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/core:protos_all_cc",
],
)
@@ -78,12 +78,12 @@ cc_library(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
"//tensorflow/compiler/xla/client/lib:constants",
"//tensorflow/compiler/xla/client/lib:math",
"//tensorflow/compiler/xla/client/lib:numeric",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/core:lib",
],
)
@@ -100,9 +100,9 @@ cc_library(
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/core:lib",
],
)
@@ -119,10 +119,10 @@ cc_library(
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:constants",
"//tensorflow/compiler/xla/client/lib:numeric",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/core:lib",
],
)
@@ -142,7 +142,7 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -162,8 +162,8 @@ cc_library(
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:util",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/core:lib",
],
)
@@ -200,8 +200,8 @@ cc_library(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/core:lib",
],
)
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
index 3c4eec081b..f666d22ea4 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include <memory>
#include <vector>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
index d07a9486f1..8757b16a1c 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.h
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h
@@ -16,8 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
#define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index 35b137aa2c..87d73eb3f0 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -23,7 +23,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/lib/util.h"
#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h
index 0f6e0e9d15..1bef9bb166 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.h
@@ -16,8 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
#define TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/tf2xla/lib/qr.cc
index 9c8ac7af25..fc0c1ee838 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.cc
+++ b/tensorflow/compiler/tf2xla/lib/qr.cc
@@ -25,7 +25,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/math.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/tf2xla/lib/qr.h b/tensorflow/compiler/tf2xla/lib/qr.h
index 3aa6a9b075..abd2316ac9 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.h
+++ b/tensorflow/compiler/tf2xla/lib/qr.h
@@ -16,7 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
#define TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/lib/random.cc b/tensorflow/compiler/tf2xla/lib/random.cc
index 8ff10fbd3f..5e7cf00ee5 100644
--- a/tensorflow/compiler/tf2xla/lib/random.cc
+++ b/tensorflow/compiler/tf2xla/lib/random.cc
@@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/status_macros.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/lib/random.h b/tensorflow/compiler/tf2xla/lib/random.h
index 2c573fd85b..59fc5d0433 100644
--- a/tensorflow/compiler/tf2xla/lib/random.h
+++ b/tensorflow/compiler/tf2xla/lib/random.h
@@ -16,7 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_
#define TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/framework/types.pb.h"
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc
index 739032fef7..ba22eff73a 100644
--- a/tensorflow/compiler/tf2xla/lib/scatter.cc
+++ b/tensorflow/compiler/tf2xla/lib/scatter.cc
@@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/lib/util.h"
#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.h b/tensorflow/compiler/tf2xla/lib/scatter.h
index 87309e10ed..13a5f1b850 100644
--- a/tensorflow/compiler/tf2xla/lib/scatter.h
+++ b/tensorflow/compiler/tf2xla/lib/scatter.h
@@ -18,8 +18,8 @@ limitations under the License.
#include <functional>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/statusor.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index a2dd5a0d57..04fa10108c 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -22,16 +22,315 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/lib/util.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/math/math_util.h"
namespace tensorflow {
+// Get the diagonal blocks of the coefficient matrix
+xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
+ xla::XlaBuilder* builder = a.builder();
+ return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+ TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(a));
+ int ndims = xla::ShapeUtil::Rank(shape);
+ int64 n = xla::ShapeUtil::GetDimension(shape, -1);
+ int64 num_blocks = n / block_size;
+
+ xla::XlaOp diag_blocks;
+
+ // If the coefficient matrix is exactly one block in size, we just add a
+ // singleton dimension, i.e. [..., n, n] -> [..., 1, n, n]
+ if (n == block_size) {
+ std::vector<int64> permutation(ndims);
+ std::iota(permutation.begin(), permutation.end(), 1);
+ permutation.insert(permutation.end() - 2, 0);
+ return Transpose(Broadcast(a, /*broadcast_sizes=*/{1}), permutation);
+ }
+
+ // We can grab entire blocks using gather
+ if (n > block_size) {
+ // Construct the starting indices of the diagonal blocks
+ auto gather_indices =
+ Transpose(Broadcast(Mul(Iota(builder, xla::S32, num_blocks),
+ xla::ConstantR0<int32>(builder, block_size)),
+ /*broadcast_sizes=*/{2}),
+ /*permutation=*/{1, 0});
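+
+ // Editorial example (not in the original patch): with num_blocks = 2 and
+ // block_size = 2, gather_indices evaluates to [[0, 0], [2, 2]]; each row
+ // holds the (row, col) starting coordinates of one diagonal block.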
+
+ // Gather the diagonal blocks
+ xla::GatherDimensionNumbers dim_numbers;
+ dim_numbers.add_output_window_dims(ndims - 1);
+ dim_numbers.add_output_window_dims(ndims);
+ dim_numbers.add_gather_dims_to_operand_dims(ndims - 2);
+ dim_numbers.add_gather_dims_to_operand_dims(ndims - 1);
+ dim_numbers.set_index_vector_dim(1);
+ diag_blocks = Gather(a, gather_indices, dim_numbers,
+ /*window_bounds=*/{block_size, block_size});
+ }
+
+ // The last block might be smaller than the block size,
+ // so we will need to pad it
+ if (n % block_size != 0) {
+ // Pad with zeros
+ auto last_blocks =
+ SliceInMinorDims(a, {n - n % block_size, n - n % block_size}, {n, n});
+ xla::PaddingConfig config = xla::MakeNoPaddingConfig(ndims);
+ int64 padding = block_size - n % block_size;
+ config.mutable_dimensions(ndims - 1)->set_edge_padding_high(padding);
+ config.mutable_dimensions(ndims - 2)->set_edge_padding_high(padding);
+ last_blocks =
+ Pad(last_blocks, Zero(builder, shape.element_type()), config);
+
+ // Add a singleton dimension
+ // i.e. [..., block_size, block_size] -> [..., 1, block_size, block_size]
+ TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
+ builder->GetShape(last_blocks));
+ auto shape_dims = xla::AsInt64Slice(blocks_shape.dimensions());
+ auto last_blocks_dims = std::vector<int64>(ndims);
+ std::copy(shape_dims.begin(), shape_dims.end(), last_blocks_dims.begin());
+ last_blocks_dims.insert(last_blocks_dims.end() - 2, 1);
+ last_blocks = Reshape(last_blocks, last_blocks_dims);
+
+ // Concatenate with the other blocks if necessary
+ if (n > block_size) {
+ diag_blocks =
+ xla::ConcatInDim(builder, {diag_blocks, last_blocks}, ndims - 2);
+ } else {
+ diag_blocks = last_blocks;
+ }
+ }
+
+ return diag_blocks;
+ });
+}
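+
+// Editorial worked example (not part of the original patch): with n = 5 and
+// block_size = 2, the gather above extracts the 2x2 blocks starting at
+// (0, 0) and (2, 2), and the trailing 1x1 block at (4, 4) is zero-padded to
+// 2x2 and concatenated, so diag_blocks has shape [..., 3, 2, 2].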
+
+xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
+ bool transpose_a, bool conjugate_a) {
+ xla::XlaBuilder* builder = diag_blocks.builder();
+ return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+ // The input is a batch of square triangular matrices. Its shape is
+ // (..., size, size); we reshape it to (num_blocks, size, size).
+ TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(diag_blocks));
+ int64 block_size = xla::ShapeUtil::GetDimension(shape, -1);
+ int64 num_blocks = xla::ShapeUtil::ElementsIn(shape) /
+ tensorflow::MathUtil::IPow(block_size, 2);
+ diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size});
+
+ // The input must be triangular because we rely on that when doing
+ // multiplications later on
+ diag_blocks = Triangle(diag_blocks, /*lower=*/lower);
+
+ // Rescale the blocks to be unit triangular, but avoid dividing by zero
+ // (which can happen if the last block was padded); otherwise NaNs would
+ // be introduced and propagate through the solve.
+ auto diags = GetMatrixDiagonal(diag_blocks);
+ TF_ASSIGN_OR_RETURN(xla::Shape diags_shape, builder->GetShape(diags));
+ auto one = ScalarLike(diags, 1);
+ auto ones = Broadcast(one, xla::AsInt64Slice(diags_shape.dimensions()));
+ diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags);
+ auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2});
+
+ // We can now use the fact that for a lower triangular matrix
+ // [[L11, 0], [L21, L22]], given the inverses L11' and L22', the (2, 1)
+ // block of the inverse is -L22' * L21 * L11'. In our case, L21 is a
+ // vector and our blocks have been rescaled to be unit triangular, so
+ // L22 = L22' = 1.
+
+ // Initialize the output matrix with -1s on the diagonal. We use -1 instead
+ // of 1 because we cannot do matrix-vector multiplies with variable shapes
+ // inside of a loop, or do irregularly shaped in-place updates. Hence,
+ // L21' <- -L22' * L21 * L11' cannot be done naively. Instead, we update
+ // the entire row, i.e. we calculate
+ // [L21 L22 0] <- -[L21 L22 0] @ diag_blocks([L11', -I, -I])
+ // which means [L21 L22 0] <- [-L21 * L11', L22, 0].
+ auto identity =
+ IdentityMatrix(builder, shape.element_type(), block_size, block_size);
+ auto neg_identity = -identity;
+
+ // The first (for lower) or last (for upper) diagonal element, though, is
+ // set to 1 instead of -1, since we never update that row.
+ auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1});
+ auto start_index = (lower) ? 0 : block_size - 1;
+ auto output_block = DynamicUpdateSlice(
+ neg_identity, pos_one,
+ /*start_indices=*/xla::ConstantR1<int>(builder, 2, start_index));
+
+ // Broadcast diag([1, -1, -1, ...]) to every block
+ xla::XlaOp output = Broadcast(output_block,
+ /*broadcast_sizes=*/{num_blocks});
+
+ // Now we construct a loop that performs matrix-vector multiplications
+ // inverting the blocks one row at a time
+ std::vector<xla::Shape> tuple_shapes = {
+ // The loop iteration counter is a scalar, incremented each iteration.
+ xla::ShapeUtil::MakeShape(xla::S32, {}),
+ // The output has the shape of A, with one row updated each iteration.
+ xla::ShapeUtil::MakeShape(shape.element_type(),
+ {num_blocks, block_size, block_size}),
+ // The input is a loop invariant.
+ xla::ShapeUtil::MakeShape(shape.element_type(),
+ {num_blocks, block_size, block_size})};
+ xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
+
+ auto init_i = One(builder, xla::S32);
+ auto init = xla::Tuple(builder, {init_i, output, scaled_diag_blocks});
+
+ // Construct the loop condition function.
+ std::unique_ptr<xla::XlaBuilder> condb =
+ builder->CreateSubBuilder("InvertDiagCond");
+ {
+ auto i = GetTupleElement(
+ Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0);
+ Lt(i, xla::ConstantR0<int32>(condb.get(), block_size));
+ }
+ TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
+
+ // Construct the loop body function.
+ std::unique_ptr<xla::XlaBuilder> bodyb =
+ builder->CreateSubBuilder("InvertDiagBody");
+ {
+ auto input_tuple =
+ Parameter(bodyb.get(), 0, tuple_shape, "InvertDiagBodyTuple");
+
+ auto i = GetTupleElement(input_tuple, 0);
+ auto body_out = GetTupleElement(input_tuple, 1);
+ auto body_input = GetTupleElement(input_tuple, 2);
+
+ auto zero = xla::ConstantR1<int32>(bodyb.get(), 1, 0);
+ auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i;
+ auto start_indices =
+ xla::ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
+ auto input_row =
+ DynamicSlice(body_input, start_indices,
+ /*slice_sizes=*/{num_blocks, 1, block_size});
+
+ // We want -L21 L11^{-1}
+ xla::DotDimensionNumbers dnums;
+ dnums.add_lhs_batch_dimensions(0);
+ dnums.add_rhs_batch_dimensions(0);
+ dnums.add_lhs_contracting_dimensions(2);
+ dnums.add_rhs_contracting_dimensions(1);
+ auto update = -DotGeneral(input_row, body_out, dnums);
+
+ body_out = DynamicUpdateSlice(body_out, update, start_indices);
+
+ auto next_i = i + ScalarLike(i, 1);
+ xla::Tuple(bodyb.get(), {next_i, body_out, body_input});
+ }
+ TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
+
+ // Construct the While loop and return the result,
+ // return while_loop(cond_fun, body_fun, init)[1]
+ auto invert_while = While(cond, body, init);
+ auto inv_diag_blocks = GetTupleElement(invert_while, 1);
+
+ // Undo the scaling
+ inv_diag_blocks = Div(inv_diag_blocks, diags,
+ /*broadcast_dimensions=*/{0, 1});
+
+ // Reshape back to original batch major dimensions
+ return Reshape(inv_diag_blocks, xla::AsInt64Slice(shape.dimensions()));
+ });
+}
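+
+// Editorial illustration (not part of the original patch): for a single
+// 2x2 unit lower triangular block [[1, 0], [l, 1]], the loop starts from
+// output = [[1, 0], [0, -1]], and the single row update computes
+// [l, 1] <- -([l, 1] @ diag(1, -1)) = [-l, 1], so the result
+// [[1, 0], [-l, 1]] is exactly the block's inverse.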
+
+xla::XlaOp SolveWithInvertedDiagonalBlocks(xla::XlaOp a, xla::XlaOp b,
+ xla::XlaOp inv_diag_blocks,
+ bool left_side, bool lower,
+ bool transpose_a, bool conjugate_a) {
+ xla::XlaBuilder* builder = a.builder();
+ return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+ TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
+ builder->GetShape(inv_diag_blocks));
+ TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
+ int64 block_size = xla::ShapeUtil::GetDimension(blocks_shape, -1);
+
+ TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
+ int64 ndims = xla::ShapeUtil::Rank(a_shape);
+ int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
+ int64 num_blocks = n / block_size + (n % block_size != 0);
+ int64 m_dim = (left_side) ? -1 : -2;
+ int64 m = xla::ShapeUtil::GetDimension(b_shape, m_dim);
+
+ // Initialize the solution
+ auto x = ZerosLike(b);
+
+ // This loop is unrolled for performance, but it could equally be written
+ // rolled, since the matrices have the same size on each iteration.
+ for (int i = 0; i < num_blocks; i++) {
+ // High-level intuition: We have B[i] = L[i] @ X. Since L is lower
+ // triangular this means B[i] = L[i, :i + 1] @ X[:i + 1]. We can split
+ // this into two parts: B[i] = L[i, :i] @ X[:i] + L[i, i] @ X[i], which
+ // can be solved for X[i] as X[i] = inv(L[i, i]) @ (B[i] - L[i, :i] @ X[:i])
+
+ // Decide whether we go from first block to last or vice versa
+ auto j = (left_side ^ lower ^ transpose_a) ? num_blocks - 1 - i : i;
+
+ // Get the size of the inverse blocks (the last one might be smaller)
+ int64 block = (n % block_size != 0 && j + 1 == num_blocks)
+ ? n % block_size
+ : block_size;
+ auto inv_block =
+ MaybeConjugate(Collapse(SliceInMinorDims(inv_diag_blocks, {j, 0, 0},
+ {j + 1, block, block}),
+ /*dimensions=*/{ndims - 2, ndims - 1}),
+ conjugate_a);
+
+ // Get the corresponding row of B
+ int64 k = std::min((j + 1) * block_size, n);
+ std::vector<int64> start = {j * block_size, 0};
+ std::vector<int64> end = {k, m};
+ if (!left_side) {
+ std::swap(start[0], start[1]);
+ std::swap(end[0], end[1]);
+ }
+ auto b_row = SliceInMinorDims(b, start, end);
+
+ xla::XlaOp remainder;
+ if (i == 0) {
+ remainder = b_row;
+ } else {
+ // This matrix multiply involves a lot of multiplying by zero (namely,
+ // X[i * block_size:] = 0), but this is faster than slicing...
+ end = {k, n};
+ if (!left_side) {
+ std::swap(end[0], end[1]);
+ }
+ if (transpose_a) {
+ std::swap(start[0], start[1]);
+ std::swap(end[0], end[1]);
+ }
+ auto a_row =
+ MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a);
+ if (left_side) {
+ remainder = b_row - BatchDot(a_row, x, transpose_a, false);
+ } else {
+ remainder = b_row - BatchDot(x, a_row, false, transpose_a);
+ }
+ }
+
+ xla::XlaOp x_update;
+ auto zero = Zero(builder, xla::S32);
+ auto start_index =
+ xla::ConstantR0WithType(builder, xla::S32, j * block_size);
+ std::vector<xla::XlaOp> update_starts = {start_index, zero};
+ if (left_side) {
+ x_update = BatchDot(inv_block, remainder, transpose_a, false);
+ } else {
+ x_update = BatchDot(remainder, inv_block, false, transpose_a);
+ std::swap(update_starts[0], update_starts[1]);
+ }
+ x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts);
+ }
+
+ return x;
+ });
+}
+
xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
bool lower, bool transpose_a, bool conjugate_a,
int64 block_size) {
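[Editorial note] The new body of TriangularSolve falls outside the context shown in this hunk. Given the three helpers introduced above, it plausibly reduces to composing them; a minimal sketch under that assumption, using only the signatures added in this patch:

  // Sketch only: assumed composition of the helpers added above,
  // not the verbatim body of the rewritten TriangularSolve.
  xla::XlaOp diag_blocks = DiagonalBlocks(a, block_size);
  xla::XlaOp inv_diag_blocks =
      InvertDiagonalBlocks(diag_blocks, lower, transpose_a, conjugate_a);
  return SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side,
                                         lower, transpose_a, conjugate_a);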
@@ -45,7 +344,7 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
xla::ShapeUtil::HumanString(a_shape), " vs. ",
xla::ShapeUtil::HumanString(b_shape));
}
- const int ndims = xla::ShapeUtil::Rank(a_shape);
+ const int64 ndims = xla::ShapeUtil::Rank(a_shape);
if (ndims < 2) {
return errors::InvalidArgument(
"Arguments to TriangularSolve must have rank >= 2: ", ndims);
@@ -85,528 +384,18 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
block_size);
}
- std::map<int, xla::XlaComputation> base_computations;
- auto get_base_triangular_solve =
- [&](int k) -> xla::StatusOr<xla::XlaComputation*> {
- xla::XlaComputation& computation = base_computations[k];
- if (computation.IsNull()) {
- std::unique_ptr<xla::XlaBuilder> sub = builder->CreateSubBuilder(
- tensorflow::strings::StrCat("trsm_base_", k));
-
- auto a_param = xla::Parameter(
- sub.get(), 0,
- xla::ShapeUtil::MakeShape(b_shape.element_type(),
- ConcatVectors(batch_dimensions, {k, k})),
- "a");
-
- std::array<int64, 2> b_lastd;
- if (left_side) {
- b_lastd = {k, n};
- } else {
- b_lastd = {m, k};
- }
- auto b_param = xla::Parameter(
- sub.get(), 1,
- xla::ShapeUtil::MakeShape(b_shape.element_type(),
- ConcatVectors(batch_dimensions, b_lastd)),
- "b");
-
- // We use a left-looking or right-looking subroutine on the block
- // diagonal in the lower=true cases, while falling back to a recursive
- // call in others. The left-looking and right-looking subroutines are
- // written with a While loop and so yields much faster compile times.
- // Moreover, they can give higher performance on smaller (sub)problems.
- if (left_side && lower) {
- TriangularSolveLeftLooking(a_param, b_param, transpose_a,
- conjugate_a);
- } else if (!left_side && lower) {
- TriangularSolveRightLooking(a_param, b_param, transpose_a,
- conjugate_a);
- } else {
- TriangularSolve(a_param, b_param, left_side, lower, transpose_a,
- conjugate_a,
- /*block_size=*/1);
- }
-
- TF_ASSIGN_OR_RETURN(computation, sub->Build());
- }
- return &computation;
- };
-
- xla::XlaOp output = xla::ZerosLike(b);
-
- // Right-looking blocked triangular solve.
- // For an explanation of the algorithm, see the TRSM discussion in:
- // Goto, Kazushige, and Robert Van De Geijn. "High-performance
- // implementation of the level-3 BLAS." ACM Transactions on Mathematical
- // Software (TOMS) 35.1 (2008): 4.
-
- // In the code comments below, T = lambda x: np.swapaxes(x, -1, -2) if
- // conjugate_a is False, or T = lambda x: np.conj(np.swapaxes(x, -1, -2)) if
- // conjugate_a is True.
-
- if (!left_side && lower == transpose_a) {
- // for i in range(0, a.shape[-1], block_size):
- for (int64 i = 0; i < n; i += block_size) {
- int64 k = std::min(block_size, n - i);
-
- // output[..., :, i:i+k] = triangular_solve(
- // a[..., i:i+k, i:i+k],
- // b[..., :, i:i+k] - np.matmul(output[..., :, :i],
- // a[..., :i, i:i+k]),
- // ..., block_size=1)
- auto a_slice = SliceInMinorDims(a, {i, i}, {i + k, i + k});
- auto b_slice = SliceInMinorDims(b, {0, i}, {m, i + k});
-
- // Note that we multiply with the full output, since this is faster
- // than slicing, and output[..., :, i:] = 0
- xla::XlaOp a_prev;
- if (lower) {
- a_prev = SliceInMinorDims(a, {i, 0}, {i + k, n});
- } else {
- a_prev = SliceInMinorDims(a, {0, i}, {n, i + k});
- }
- auto prev_contribution = BatchDot(output, a_prev,
- /*transpose_x=*/false,
- /*transpose_y=*/transpose_a,
- /*conjugate_x=*/false,
- /*conjugate_y=*/conjugate_a);
- auto to_solve = b_slice - prev_contribution;
-
- xla::XlaOp update;
- if (k > 1) {
- TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve,
- get_base_triangular_solve(k));
- update = xla::Call(builder, *solve, {a_slice, to_solve});
- } else {
- auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
- update = to_solve / a_slice_conj;
- }
- output = UpdateSliceInMinorDims(output, update, {0, i});
- }
-
- } else if (left_side && lower != transpose_a) {
- // for i in range(0, a.shape[-1], block_size):
- for (int64 i = 0; i < m; i += block_size) {
- int64 k = std::min(block_size, m - i);
-
- // output[..., i:i+k, :] = triangular_solve(
- // a[..., i:i+k, i:i+k],
- // b[..., i:i+k, :] - np.matmul(a[..., i:i+k, :i],
- // output[..., :i, :]),
- // ..., block_size=1)
- auto a_slice = SliceInMinorDims(a, {i, i}, {i + k, i + k});
- auto b_slice = SliceInMinorDims(b, {i, 0}, {i + k, n});
-
- xla::XlaOp a_prev;
- if (lower) {
- a_prev = SliceInMinorDims(a, {i, 0}, {i + k, m});
- } else {
- a_prev = SliceInMinorDims(a, {0, i}, {m, i + k});
- }
- auto prev_contribution = BatchDot(a_prev, output,
- /*transpose_x=*/transpose_a,
- /*transpose_y=*/false,
- /*conjugate_x=*/conjugate_a,
- /*conjugate_y=*/false);
- auto to_solve = b_slice - prev_contribution;
-
- xla::XlaOp update;
- if (k > 1) {
- TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve,
- get_base_triangular_solve(k));
- update = xla::Call(builder, *solve, {a_slice, to_solve});
- } else {
- auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
- update = to_solve / a_slice_conj;
- }
- output = UpdateSliceInMinorDims(output, update, {i, 0});
- }
- } else if (!left_side && lower != transpose_a) {
- // for i in reversed(range(0, a.shape[-1], block_size)):
- const int64 last_blk_ix =
- xla::RoundUpToNearest(n, block_size) - block_size;
- for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
- int64 k = std::min(block_size, n - i);
-
- // output[..., :, i:i+k] = triangular_solve(
- // a[..., i:i+k, i:i+k],
- // b[..., :, i:i+k] - np.matmul(output[..., :, :i],
- // a[..., :i, i:i+k]),\
- // ..., block_size=1)
- auto a_slice = SliceInMinorDims(a, {i, i}, {i + k, i + k});
- auto b_slice = SliceInMinorDims(b, {0, i}, {m, i + k});
-
- xla::XlaOp a_prev;
- if (lower) {
- a_prev = SliceInMinorDims(a, {0, i}, {n, i + k});
- } else {
- a_prev = SliceInMinorDims(a, {i, 0}, {i + k, n});
- }
- auto prev_contribution = BatchDot(output, a_prev,
- /*transpose_x=*/false,
- /*transpose_y=*/transpose_a,
- /*conjugate_x=*/false,
- /*conjugate_y=*/conjugate_a);
- auto to_solve = b_slice - prev_contribution;
-
- xla::XlaOp update;
- if (k > 1) {
- TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve,
- get_base_triangular_solve(k));
- update = xla::Call(builder, *solve, {a_slice, to_solve});
- } else {
- auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
- update = to_solve / a_slice_conj;
- }
- output = UpdateSliceInMinorDims(output, update, {0, i});
- }
- } else { // left_side && lower == transpose_a
- // for i in reversed(range(0, a.shape[-1], block_size)):
- const int64 last_blk_ix =
- xla::RoundUpToNearest(m, block_size) - block_size;
- for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
- int64 k = std::min(block_size, m - i);
-
- // output[..., i:i+k, :] = triangular_solve(
- // a[..., i:i+k, i:i+k],
- // b[..., i:i+k, :] - np.matmul(a[..., i:i+k, :i],
- // output[..., :i, :]),
- // ..., block_size=1)
- auto a_slice = SliceInMinorDims(a, {i, i}, {i + k, i + k});
- auto b_slice = SliceInMinorDims(b, {i, 0}, {i + k, n});
-
- xla::XlaOp a_prev;
- if (lower) {
- a_prev = SliceInMinorDims(a, {0, i}, {m, i + k});
- } else {
- a_prev = SliceInMinorDims(a, {i, 0}, {i + k, m});
- }
- auto prev_contribution = BatchDot(a_prev, output,
- /*transpose_x=*/transpose_a,
- /*transpose_y=*/false,
- /*conjugate_x=*/conjugate_a,
- /*conjugate_y=*/false);
- auto to_solve = b_slice - prev_contribution;
-
- xla::XlaOp update;
- if (k > 1) {
- TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve,
- get_base_triangular_solve(k));
- update = xla::Call(builder, *solve, {a_slice, to_solve});
- } else {
- auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
- update = to_solve / a_slice_conj;
- }
- output = UpdateSliceInMinorDims(output, update, {i, 0});
- }
- }
-
- return output;
- });
-}
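These branches are all the same blocked substitution, differing only in sweep direction and in which operand carries the transpose. A minimal NumPy sketch of the forward, left-side, lower-triangular branch, with np.linalg.solve standing in for the base-case computation that get_base_triangular_solve builds (illustrative only):

import numpy as np

def blocked_solve_left_lower(a, b, block_size):
    """Forward block substitution for a @ x = b with lower-triangular a.
    np.linalg.solve stands in for the base-case solve that
    get_base_triangular_solve builds."""
    m = a.shape[-1]
    out = np.zeros_like(b)
    for i in range(0, m, block_size):
        k = min(block_size, m - i)
        # Subtract the contribution of the rows solved so far.
        to_solve = b[i:i + k] - a[i:i + k, :i] @ out[:i]
        out[i:i + k] = np.linalg.solve(a[i:i + k, i:i + k], to_solve)
    return out

a = np.tril(np.random.rand(7, 7)) + 3 * np.eye(7)
b = np.random.rand(7, 2)
assert np.allclose(blocked_solve_left_lower(a, b, 3), np.linalg.solve(a, b))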
-
-xla::XlaOp TriangularSolveLeftLooking(xla::XlaOp a, xla::XlaOp b,
- bool transpose_a, bool conjugate_a) {
- xla::XlaBuilder* builder = a.builder();
- return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
- TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
- TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
- const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2);
- const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1);
- const int64 ndims = xla::ShapeUtil::Rank(a_shape);
-
- std::vector<int64> batch_dimensions;
- int64 num_batches = 1;
- for (int i = 0; i < ndims - 2; ++i) {
- int64 a_size = a_shape.dimensions(i);
- batch_dimensions.push_back(a_size);
- num_batches = num_batches * a_size;
- }
-
- // Rescale the input to be unit triangular
- auto diag = xla::GetMatrixDiagonal(a);
- xla::XlaOp scaled_a;
- std::vector<int64> broadcast_dimensions(ndims - 1);
- std::iota(broadcast_dimensions.begin(), broadcast_dimensions.end(), 0);
- if (transpose_a) {
- scaled_a = Div(a, diag, broadcast_dimensions);
- } else {
- // Broadcast over the rows
- broadcast_dimensions[ndims - 2] = ndims - 1;
- scaled_a = Div(a, diag, broadcast_dimensions);
- }
-
- // The main computation is performed in a While loop.
-
- // Allocate the output and set its first or last row,
- // output = np.zeros_like(b)
- // if transpose_a:
- // output[..., m-1:, :] = b[..., m-1:, :] / a[..., m-1:, m-1:]
- // else:
- // output[..., :1, :] = b[..., :1, :] / a[..., :1, :1]
- xla::XlaOp output = xla::ZerosLike(b);
- {
- auto i = transpose_a ? m - 1 : 0;
- auto a_slice = SliceInMinorDims(scaled_a, {i, i}, {i + 1, i + 1});
- auto b_slice = SliceInMinorDims(b, {i, 0}, {i + 1, n});
- auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
- auto update = b_slice / a_slice_conj;
- output = UpdateSliceInMinorDims(output, update, {i, 0});
- }
-
- // Construct the initial loop carry tuple,
- // if transpose_a:
- // init = (m-2, output, a, b)
- // else:
- // init = (1, output, a, b)
- std::vector<xla::Shape> tuple_shapes = {
-        // The loop iteration counter is a scalar, stepped by +/-1 each iteration.
- xla::ShapeUtil::MakeShape(xla::S32, {}),
- // The output has the shape of b, with one row updated each iteration.
- b_shape,
- // The coefficient matrix a is a loop invariant.
- a_shape,
- // The right-hand-side matrix b is a loop invariant.
- b_shape};
- xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
- auto init_i = xla::ConstantR0<int32>(builder, transpose_a ? m - 2 : 1);
- auto init = xla::Tuple(builder, {init_i, output, scaled_a, b});
-
- // Construct the loop condition function,
- // def cond_fun(loop_carry):
- // i, output, a, b = loop_carry
- // return i >= 0 if transpose_a else i < m
- std::unique_ptr<xla::XlaBuilder> condb =
- builder->CreateSubBuilder("TriangularSolveLeftLookingWhileCond");
- {
- auto i = xla::GetTupleElement(
- xla::Parameter(condb.get(), 0, tuple_shape,
- "TriangularSolveLeftLookingWhileTuple"),
- 0);
- if (transpose_a) {
- xla::Ge(i, xla::ConstantR0<int32>(condb.get(), 0));
- } else {
- xla::Lt(i, xla::ConstantR0<int32>(condb.get(), m));
- }
- }
- TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
-
- // Construct the loop body function,
- // def body_fun(loop_carry):
- // i, output, a, b = loop_carry
- // if transpose_a:
-  //     a_row = np.swapaxes(a[..., i+1:, i:i+1], -1, -2)
- // else:
- // a_row = a[..., i:i+1, :i]
- // result_row = b[..., i:i+1, :] - np.matmul(a_row, output[..., :, :])
- // output[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
- // if transpose_a:
- // return (i - 1, output, a, b)
- // else:
- // return (i + 1, output, a, b)
- // We have to do some extra FLOPs propagating zeros in the matrix multiply
- // because we can't have the size of its arguments depend on the loop
- // counter.
- std::unique_ptr<xla::XlaBuilder> bodyb =
- builder->CreateSubBuilder("TriangularSolveLeftLookingWhileBody");
- {
- auto input_tuple = xla::Parameter(bodyb.get(), 0, tuple_shape,
- "TriangularSolveLeftLookingWhileTuple");
-
- // i, output, a, b = loop_carry
- auto i = xla::GetTupleElement(input_tuple, 0);
- auto body_out = xla::GetTupleElement(input_tuple, 1);
- auto body_a = xla::GetTupleElement(input_tuple, 2);
- auto body_b = xla::GetTupleElement(input_tuple, 3);
- auto zero = xla::ConstantR0<int32>(bodyb.get(), 0);
-
- // We'd like to implement this:
- // if transpose_a:
- // a_row = T(a[..., i+1:, i:i+1])
- // result_row = (b[..., i:i+1, :]
- // - np.matmul(a_row, body_out[..., i+1:, :]))
- // else:
- // result_row = (b[..., i:i+1, :]
- // - np.matmul(a[..., i:i+1, :i], body_out[..., :i, :]))
- // But since we can't have intermediate array sizes depend on the loop
- // counter, we instead exploit the fact that we initialized the output to
- // all zeros and use that as zero-padding (doing unnecessary FLOPs).
- xla::XlaOp a_row;
- if (transpose_a) {
- a_row = DynamicSliceInMinorDims(body_a, {zero, i}, {m, 1});
- } else {
- a_row = DynamicSliceInMinorDims(body_a, {i, zero}, {1, m});
- }
- auto b_update = BatchDot(a_row, body_out,
- /*transpose_x=*/transpose_a,
- /*transpose_y=*/false,
- /*conjugate_x=*/conjugate_a,
- /*conjugate_y=*/false);
- auto result_row_slice =
- DynamicSliceInMinorDims(body_b, {i, zero}, {1, n});
- auto result_row = result_row_slice - b_update;
-
- // body_out[..., i:i+1, :] = result_row
- body_out = DynamicUpdateSliceInMinorDims(body_out, result_row, {i, zero});
-
- // if transpose_a:
- // return (i - 1, body_out, a, b)
- // else:
- // return (i + 1, body_out, a, b)
- auto next_i = xla::Add(
- i, xla::ConstantR0<int32>(bodyb.get(), transpose_a ? -1 : 1));
- xla::Tuple(bodyb.get(), {next_i, body_out, body_a, body_b});
- }
- TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
-
- // Construct the While loop and return the result,
- // return while_loop(cond_fun, body_fun, init)[1]
- auto triangular_solve_left_looking_while = xla::While(cond, body, init);
- output = xla::GetTupleElement(triangular_solve_left_looking_while, 1);
- auto scaling = MaybeConjugate(diag, conjugate_a);
- // Broadcast over the columns
- broadcast_dimensions[ndims - 2] = ndims - 2;
- return Div(output, scaling, broadcast_dimensions);
- });
-}
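For reference, the deleted left-looking routine above computes a row-at-a-time substitution using two tricks: a is rescaled so the triangle has a unit diagonal (hoisting the division out of the loop), and each loop step multiplies by a full-length row, relying on the zero-initialized output as padding so that no operand shape depends on the loop counter. A NumPy sketch of its non-transposed path (illustrative only):

import numpy as np

def left_looking_solve(a, b):
    """Row-at-a-time solve of a @ x = b for lower-triangular a.
    NumPy sketch of the deleted routine's non-transposed path."""
    m = b.shape[0]
    d = np.diagonal(a)
    # Rescale so the triangle has a unit diagonal: a == scaled_a @ np.diag(d).
    scaled_a = a / d[None, :]
    y = np.zeros_like(b)
    y[0] = b[0]  # first row of the unit-triangular system
    for i in range(1, m):
        # Multiply by the full row: rows >= i of y are still zero, so they
        # pad the product instead of forcing a loop-dependent slice shape.
        y[i] = b[i] - scaled_a[i, :] @ y
    # Undo the rescaling: x = y / d, row by row.
    return y / d[:, None]

a = np.tril(np.random.rand(5, 5)) + 2 * np.eye(5)
b = np.random.rand(5, 2)
assert np.allclose(left_looking_solve(a, b), np.linalg.solve(a, b))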
-
-xla::XlaOp TriangularSolveRightLooking(xla::XlaOp a, xla::XlaOp b,
- bool transpose_a, bool conjugate_a) {
- xla::XlaBuilder* builder = a.builder();
- return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
- TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
- TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
- const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2);
- const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1);
- const int64 ndims = xla::ShapeUtil::Rank(a_shape);
-
- std::vector<int64> batch_dimensions;
- int64 num_batches = 1;
- for (int i = 0; i < ndims - 2; ++i) {
- int64 a_size = a_shape.dimensions(i);
- batch_dimensions.push_back(a_size);
- num_batches = num_batches * a_size;
- }
+    // We find the diagonal blocks of the coefficient matrix.
+ auto diag_blocks = DiagonalBlocks(a, block_size);
- // Rescale the input to be unit triangular
- auto diag = xla::GetMatrixDiagonal(a);
- xla::XlaOp scaled_a;
- std::vector<int64> broadcast_dimensions(ndims - 1);
- std::iota(broadcast_dimensions.begin(), broadcast_dimensions.end(), 0);
- if (transpose_a) {
- // Broadcast over the rows
- broadcast_dimensions[ndims - 2] = ndims - 1;
- scaled_a = Div(a, diag, broadcast_dimensions);
- } else {
- scaled_a = Div(a, diag, broadcast_dimensions);
- }
+    // We invert these blocks in parallel using batched matrix-vector products.
+ auto inv_diag_blocks =
+ InvertDiagonalBlocks(diag_blocks, lower, transpose_a, conjugate_a);
- // The main computation is performed in a While loop.
- xla::XlaOp output = xla::ZerosLike(b);
+    // We now find the solution using GEMMs.
+ auto x = SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side,
+ lower, transpose_a, conjugate_a);
- // Construct the initial loop carry tuple,
- // if transpose_a:
- // init = (0, output, a, b)
- // else:
- // init = (n-1, output, a, b)
- std::vector<xla::Shape> tuple_shapes = {
-        // The loop iteration counter is a scalar, stepped by +/-1 each iteration.
- xla::ShapeUtil::MakeShape(xla::S32, {}),
-        // The output has the shape of b, with one column updated each iteration.
- b_shape,
- // The coefficient matrix a is a loop invariant.
- a_shape,
- // The right-hand-side matrix b is a loop invariant.
- b_shape};
- xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
- auto init_i = xla::ConstantR0<int32>(builder, transpose_a ? 0 : n - 1);
- auto init = xla::Tuple(builder, {init_i, output, scaled_a, b});
-
- // Construct the loop condition function,
- // def cond_fun(loop_carry):
- // i, output, a, b = loop_carry
- // return i < n if transpose_a else i >= 0
- std::unique_ptr<xla::XlaBuilder> condb =
- builder->CreateSubBuilder("TriangularSolveRightLookingWhileCond");
- {
- auto i = xla::GetTupleElement(
- xla::Parameter(condb.get(), 0, tuple_shape,
- "TriangularSolveRightLookingWhileTuple"),
- 0);
- if (transpose_a) {
- xla::Lt(i, xla::ConstantR0<int32>(condb.get(), n));
- } else {
- xla::Ge(i, xla::ConstantR0<int32>(condb.get(), 0));
- }
- }
- TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
-
- // Construct the loop body function,
- // def body_fun(loop_carry):
- // i, output, a, b = loop_carry
- // if transpose_a:
- // a_row = np.swapaxes(a[..., :, i:i+1], -1, -2)
- // else:
- // a_row = a[..., :, i:i+1]
- // result_row = b[..., :, i:i+1] - np.matmul(output, a_row)
- // output[..., :, i:i+1] = result_row / a[..., i:i+1, i:i+1]
-    //   if transpose_a:
-    //     return (i + 1, output, a, b)
-    //   else:
-    //     return (i - 1, output, a, b)
- // We have to do some extra FLOPs propagating zeros in the matrix multiply
- // because we can't have the size of its arguments depend on the loop
- // counter.
- std::unique_ptr<xla::XlaBuilder> bodyb =
- builder->CreateSubBuilder("TriangularSolveRightLookingWhileBody");
- {
- auto input_tuple = xla::Parameter(
- bodyb.get(), 0, tuple_shape, "TriangularSolveRightLookingWhileTuple");
-
- // i, output, a, b = loop_carry
- auto i = xla::GetTupleElement(input_tuple, 0);
- auto body_out = xla::GetTupleElement(input_tuple, 1);
- auto body_a = xla::GetTupleElement(input_tuple, 2);
- auto body_b = xla::GetTupleElement(input_tuple, 3);
- auto zero = xla::ConstantR0<int32>(bodyb.get(), 0);
-
- // result = b - np.matmul(output, a)
- // result_row = result[..., :, i:i+1]
- auto body_b_slice = DynamicSliceInMinorDims(body_b, {zero, i}, {m, 1});
- xla::XlaOp a_slice;
- if (transpose_a) {
- a_slice = DynamicSliceInMinorDims(body_a, {i, zero}, {1, n});
- } else {
- a_slice = DynamicSliceInMinorDims(body_a, {zero, i}, {n, 1});
- }
- auto b_update = body_b_slice - BatchDot(body_out, a_slice,
- /*transpose_x=*/false,
- /*transpose_y=*/transpose_a,
- /*conjugate_x=*/false,
- /*conjugate_y=*/conjugate_a);
-
- // body_out[..., :, i:i+1] = b_update
- body_out = DynamicUpdateSliceInMinorDims(body_out, b_update, {zero, i});
-
- // if transpose_a:
- // return (i + 1, body_out, a, b)
- // else:
- // return (i - 1, body_out, a, b)
- auto next_i = xla::Add(
- i, xla::ConstantR0<int32>(bodyb.get(), transpose_a ? 1 : -1));
- xla::Tuple(bodyb.get(), {next_i, body_out, body_a, body_b});
- }
- TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
-
- // Construct the While loop and return the result,
- // return while_loop(cond_fun, body_fun, init)[1]
-    auto triangular_solve_right_looking_while = xla::While(cond, body, init);
-    output = xla::GetTupleElement(triangular_solve_right_looking_while, 1);
- auto scaling = MaybeConjugate(diag, conjugate_a);
- // Broadcast over the rows
- broadcast_dimensions[ndims - 2] = ndims - 1;
- return Div(output, scaling, broadcast_dimensions);
+ return x;
});
}
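Taken together, the rewritten TriangularSolve keeps the blocked sweep but replaces each per-block base-case solve with a multiplication by a precomputed block inverse, so only the (batchable) inversions are non-GEMM work. A minimal NumPy sketch of the left-side, lower, non-transposed case; only the three helper names above come from this change, the rest is illustrative:

import numpy as np

def solve_with_inverted_blocks(a, b, block_size):
    """Same forward sweep as the earlier sketch, but every diagonal
    block is inverted up front, so the sweep itself is pure GEMMs."""
    m = a.shape[-1]
    starts = list(range(0, m, block_size))
    # The block inversions are independent of one another, so XLA can
    # perform them as one batched computation.
    inv = []
    for i in starts:
        k = min(block_size, m - i)
        inv.append(np.linalg.inv(a[i:i + k, i:i + k]))
    out = np.zeros_like(b)
    for blk, i in enumerate(starts):
        k = min(block_size, m - i)
        out[i:i + k] = inv[blk] @ (b[i:i + k] - a[i:i + k, :i] @ out[:i])
    return out

a = np.tril(np.random.rand(8, 8)) + 4 * np.eye(8)
b = np.random.rand(8, 3)
assert np.allclose(solve_with_inverted_blocks(a, b, 3),
                   np.linalg.solve(a, b))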
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
index 7eb9238014..555760b7ef 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
@@ -16,8 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
#define TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
namespace tensorflow {
@@ -61,12 +61,6 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
bool lower, bool transpose_a, bool conjugate_a,
int64 block_size = 128);
-xla::XlaOp TriangularSolveLeftLooking(xla::XlaOp a, xla::XlaOp b,
- bool transpose_a, bool conjugate_a);
-
-xla::XlaOp TriangularSolveRightLooking(xla::XlaOp a, xla::XlaOp b,
- bool transpose_a, bool conjugate_a);
-
} // namespace tensorflow
#endif // TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
index f1bff6037b..aeebf16028 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/test.h"
@@ -207,6 +207,28 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
xla::ErrorSpec(1e-2, 1e-2));
}
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
+ xla::XlaBuilder builder(TestName());
+
+ xla::XlaOp a, b;
+ auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+ auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+ TriangularSolve(a, b,
+ /*left_side=*/true, /*lower=*/true,
+ /*transpose_a=*/false, /*conjugate_a=*/false,
+ /*block_size=*/3);
+
+ xla::Array2D<float> expected({
+ {0.5, 1.0, 1.5},
+ {0.41666667, 0.33333333, 0.25},
+ {0.23148148, 0.18518519, 0.13888889},
+ {0.16835017, 0.13468013, 0.1010101},
+ });
+
+ ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+ xla::ErrorSpec(1e-2, 1e-2));
+}
+
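This new test pins down the case where block_size (3) does not evenly divide the matrix dimension (4), leaving a trailing 1x1 diagonal block. The expected literals are just an ordinary triangular solve of the fixture values; a quick SciPy check (the a and b arrays below reproduce AValsLower() and BValsLeft() as implied by the expected output):

import numpy as np
from scipy.linalg import solve_triangular

# AValsLower() / BValsLeft(), as implied by the expected literals.
a = np.array([[2., 0., 0., 0.],
              [3., 6., 0., 0.],
              [4., 7., 9., 0.],
              [5., 8., 10., 11.]])
b = np.array([[1., 2., 3.],
              [4., 5., 6.],
              [7., 8., 9.],
              [10., 11., 12.]])
expected = np.array([[0.5, 1.0, 1.5],
                     [0.41666667, 0.33333333, 0.25],
                     [0.23148148, 0.18518519, 0.13888889],
                     [0.16835017, 0.13468013, 0.1010101]])
assert np.allclose(solve_triangular(a, b, lower=True), expected)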
XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
xla::XlaBuilder builder(TestName());
@@ -307,47 +329,5 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
xla::ErrorSpec(1e-2, 1e-2));
}
-XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) {
- xla::XlaBuilder builder(TestName());
-
- xla::XlaOp a, b;
- auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
- auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
- TriangularSolveLeftLooking(a, b,
- /*transpose_a=*/false,
- /*conjugate_a=*/false);
-
- xla::Array2D<float> expected({
- {0.5, 1.0, 1.5},
- {0.41666667, 0.33333333, 0.25},
- {0.23148148, 0.18518519, 0.13888889},
- {0.16835017, 0.13468013, 0.1010101},
- });
-
- ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
- xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveLeftLookingTest, NonzeroUpperTriangle) {
- xla::XlaBuilder builder(TestName());
-
- xla::XlaOp a, b;
- auto a_data = CreateR2Parameter<float>(AValsFull(), 0, "a", &builder, &a);
- auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
- TriangularSolveLeftLooking(a, b,
- /*transpose_a=*/false,
- /*conjugate_a=*/false);
-
- xla::Array2D<float> expected({
- {0.5, 1.0, 1.5},
- {0.41666667, 0.33333333, 0.25},
- {0.23148148, 0.18518519, 0.13888889},
- {0.16835017, 0.13468013, 0.1010101},
- });
-
- ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
- xla::ErrorSpec(1e-2, 1e-2));
-}
-
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index a6f5d346cb..8b5beba383 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include <memory>
#include <vector>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 6cb6c088e9..b4905c9528 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -16,8 +16,8 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
#define TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/tf2xla/lib/while_loop.cc
index 574e70ddee..d64394f140 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.cc
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.cc
@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
#include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.h b/tensorflow/compiler/tf2xla/lib/while_loop.h
index 5b6684c995..9493b1f109 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.h
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.h
@@ -19,8 +19,8 @@ limitations under the License.
#include <functional>
#include <vector>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index ac768b206e..48568c825b 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/graph.pb.h"
diff --git a/tensorflow/compiler/tf2xla/tf2xla.h b/tensorflow/compiler/tf2xla/tf2xla.h
index d02fc56c5b..432a12a516 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.h
+++ b/tensorflow/compiler/tf2xla/tf2xla.h
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
#include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/core/framework/graph.pb.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index f0b30dcf4e..56f7045a98 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index fe7ec633ec..e89f473328 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/sharding_util.h"
#include "tensorflow/compiler/tf2xla/xla_context.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h
index d0b9e34e16..a6e7882533 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.h
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h
@@ -19,7 +19,7 @@ limitations under the License.
#include <memory>
#include "tensorflow/compiler/tf2xla/xla_resource.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/device_base.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index cb47581e36..226c89bcf1 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -28,12 +28,14 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
#include "tensorflow/compiler/tf2xla/xla_context.h"
#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/executor.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/common_runtime/graph_optimizer.h"
#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/graph/algorithm.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/graph/node_builder.h"
@@ -688,12 +690,12 @@ Status ValidateFunctionDef(const FunctionDef* fdef,
Status ValidateGraph(const Graph* graph,
const FunctionLibraryDefinition& flib_def,
const DeviceType& device_type, const string& name) {
- auto maybe_error = [&](const string& op, const Status& s) -> Status {
+ auto maybe_error = [&](const Node* node, const Status& s) -> Status {
if (!s.ok()) {
return errors::InvalidArgument(strings::StrCat(
"Detected unsupported operations when trying to compile graph ", name,
- " on ", device_type.type_string(), ": ", op, " (", s.error_message(),
- ")"));
+ " on ", device_type.type_string(), ": ", node->def().op(), " (",
+ s.error_message(), ")", FormatNodeForError(*node)));
}
return Status::OK();
};
@@ -706,15 +708,15 @@ Status ValidateGraph(const Graph* graph,
Status s;
if (fdef) {
s = ValidateFunctionDef(fdef, flib_def);
- TF_RETURN_IF_ERROR(maybe_error(node->def().op(), s));
+ TF_RETURN_IF_ERROR(maybe_error(node, s));
continue;
}
const OpDef* op_def;
s = OpRegistry::Global()->LookUpOpDef(node->def().op(), &op_def);
- TF_RETURN_IF_ERROR(maybe_error(node->def().op(), s));
+ TF_RETURN_IF_ERROR(maybe_error(node, s));
TF_RETURN_IF_ERROR(ValidateNodeDef(node->def(), *op_def));
s = FindKernelDef(device_type, node->def(), nullptr, nullptr);
- TF_RETURN_IF_ERROR(maybe_error(node->def().op(), s));
+ TF_RETURN_IF_ERROR(maybe_error(node, s));
}
return Status::OK();
}
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 079c99797e..acc64d99d3 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 2fb93be01d..be00ed8813 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -312,7 +312,7 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
str_util::StrContains(status.error_message(), "depends on a parameter"))
<< status.error_message();
EXPECT_TRUE(
- str_util::StrContains(status.error_message(), "[[Node: C = Reshape"))
+ str_util::StrContains(status.error_message(), "[[{{node C}} = Reshape"))
<< status.error_message();
}
@@ -1077,6 +1077,8 @@ TEST_F(XlaCompilerTest, FunctionWithInvalidOp) {
ASSERT_FALSE(status.ok());
EXPECT_TRUE(str_util::StrContains(status.error_message(), "InvalidOp"))
<< status.error_message();
+ EXPECT_TRUE(str_util::StrContains(status.error_message(), "{{node fill_fn}}"))
+ << status.error_message();
}
// Tests a graph which has a node with invalid data type.
@@ -1101,6 +1103,8 @@ TEST_F(XlaCompilerTest, NodeWithInvalidDataType) {
EXPECT_TRUE(str_util::StrContains(status.error_message(),
"is not in the list of allowed values"))
<< status.error_message();
+ EXPECT_TRUE(str_util::StrContains(status.error_message(), "{{node Shape}}"))
+ << status.error_message();
}
TEST_F(XlaCompilerTest, SingleOpWithoutInputs) {
@@ -1122,9 +1126,10 @@ TEST_F(XlaCompilerTest, SingleOpWithoutInputs) {
status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "NoOp",
std::move(graph_copy), args, &result);
ASSERT_FALSE(status.ok());
- EXPECT_TRUE(str_util::StrContains(status.error_message(),
- "The following nodes are unreachable "
- "from the source in the graph: NoOp"))
+ EXPECT_TRUE(
+ str_util::StrContains(status.error_message(),
+ "The following nodes are unreachable "
+ "from the source in the graph: {{node NoOp}}"))
<< status.error_message();
}
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 0dea366476..b24e3aabbe 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -25,7 +25,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 38d8cd653c..3db37afdba 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -22,8 +22,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
index dc98d4fda6..1398e9ee53 100644
--- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h"
#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/core/framework/kernel_def.pb.h"
@@ -21,20 +20,6 @@ limitations under the License.
namespace tensorflow {
bool GpuOpFilter(KernelDef* kdef) {
- // TODO(b/31361304): The GPU backend does not parallelize PRNG ops, leading to
- // slow code.
- legacy_flags::BackendRegistrationFlags* flags =
- legacy_flags::GetBackendRegistrationFlags();
- VLOG(2) << "flags->tf_enable_prng_ops_gpu: " << flags->tf_enable_prng_ops_gpu;
- if (!flags->tf_enable_prng_ops_gpu &&
- (kdef->op() == "RandomStandardNormal" || kdef->op() == "RandomUniform" ||
- kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal")) {
- return false;
- }
- // TODO(b/26783907): The GPU backend currently does not implement sort.
- if (kdef->op() == "XlaSort" || kdef->op() == "TopKV2") {
- return false;
- }
if (kdef->op() == "Const") {
AddDtypeToKernalDefConstraint("dtype", DT_STRING, kdef);
}
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 4d1b3b1a13..8efb3d55c8 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -26,7 +26,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index d6ca4ab934..e6522157a5 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -19,7 +19,7 @@ limitations under the License.
#define TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_
#include "tensorflow/compiler/tf2xla/xla_context.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 9e17756b27..00ccfb1c78 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index e8eafb3819..82028c8b9c 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -21,7 +21,8 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/shape_util.h"
#include "tensorflow/compiler/tf2xla/type_util.h"
#include "tensorflow/compiler/tf2xla/xla_context.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 6203cffd80..ac9dfe3369 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -17,7 +17,8 @@ limitations under the License.
#define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_
#include "tensorflow/compiler/tf2xla/xla_compiler.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc
index baea814965..7928fa0347 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.cc
+++ b/tensorflow/compiler/tf2xla/xla_resource.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/tf2xla/sharding_util.h"
#include "tensorflow/compiler/tf2xla/xla_context.h"
#include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index 4de18a7788..2438490be1 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -18,7 +18,7 @@ limitations under the License.
#include <memory>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.pb.h"
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index f1c383fd9e..fdf13bb18c 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -636,7 +636,7 @@ cc_library(
":window_util",
":xla_data_proto",
"//tensorflow/compiler/xla/client:padding",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_evaluator",
"//tensorflow/compiler/xla/service:shape_inference",
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 25666cad40..ad3fcee05b 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -64,6 +64,7 @@ cc_library(
hdrs = ["client.h"],
deps = [
":global_data",
+ ":xla_computation",
"//tensorflow/compiler/xla:execution_options_util",
"//tensorflow/compiler/xla:literal",
"//tensorflow/compiler/xla:service_interface",
@@ -73,7 +74,6 @@ cc_library(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla:xla_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
"//tensorflow/compiler/xla/service:hlo_proto",
"//tensorflow/core:lib",
@@ -100,12 +100,12 @@ cc_library(
deps = [
":client",
":executable_build_options",
+ ":xla_computation",
"//tensorflow/compiler/xla:executable_run_options",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/service:backend",
"//tensorflow/compiler/xla/service:compiler",
"//tensorflow/compiler/xla/service:device_memory_allocator",
@@ -114,6 +114,7 @@ cc_library(
"//tensorflow/compiler/xla/service:local_service",
"//tensorflow/compiler/xla/service:shaped_buffer",
"//tensorflow/compiler/xla/service:source_map_util",
+ "//tensorflow/compiler/xla/service:stream_pool",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"@llvm//:support",
@@ -126,11 +127,11 @@ cc_library(
hdrs = ["compile_only_client.h"],
deps = [
":client",
+ ":xla_computation",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/service:compile_only_service",
"//tensorflow/compiler/xla/service:compiler",
"//tensorflow/core:stream_executor_no_cuda",
@@ -174,3 +175,60 @@ cc_library(
"//tensorflow/compiler/xla:xla_data_proto",
],
)
+
+cc_library(
+ name = "xla_computation",
+ srcs = ["xla_computation.cc"],
+ hdrs = ["xla_computation.h"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//tensorflow/compiler/xla:status_macros",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:hlo_proto",
+ ],
+)
+
+cc_library(
+ name = "xla_builder",
+ srcs = ["xla_builder.cc"],
+ hdrs = ["xla_builder.h"],
+ visibility = ["//visibility:public"],
+ deps = [
+ ":padding",
+ ":sharding_builder",
+ ":xla_computation",
+ "//tensorflow/compiler/xla:execution_options_util",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:literal_util",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:status_macros",
+ "//tensorflow/compiler/xla:statusor",
+ "//tensorflow/compiler/xla:types",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service:hlo_proto",
+ "//tensorflow/compiler/xla/service:shape_inference",
+ "//tensorflow/core:lib",
+ ],
+)
+
+tf_cc_test(
+ name = "xla_builder_test",
+ srcs = ["xla_builder_test.cc"],
+ deps = [
+ ":xla_builder",
+ ":xla_computation",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:status_macros",
+ "//tensorflow/compiler/xla:test",
+ "//tensorflow/compiler/xla:test_helpers",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service:hlo_matchers",
+ "//tensorflow/core:test",
+ ],
+)
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 8e54311bad..d0ce5e8a6a 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include <string>
#include <utility>
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/execution_options_util.h"
#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
#include "tensorflow/compiler/xla/literal.h"
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index d751e183dd..be50cebfcc 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -20,7 +20,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service_interface.h"
diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h
index 332c965036..a551edeab0 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.h
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@@ -17,7 +17,7 @@ limitations under the License.
#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
#include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/compile_only_service.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index 77ba474cf6..789daf4728 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -29,8 +29,8 @@ cc_library(
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/core:lib",
],
)
@@ -45,7 +45,7 @@ cc_library(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
],
)
@@ -58,7 +58,7 @@ xla_test(
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
],
@@ -72,7 +72,7 @@ cc_library(
":constants",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
],
)
@@ -86,7 +86,7 @@ xla_test(
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
],
@@ -101,7 +101,7 @@ cc_library(
":constants",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/core:lib",
],
)
@@ -115,7 +115,7 @@ xla_test(
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
],
@@ -131,7 +131,7 @@ cc_library(
":numeric",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/core:lib",
],
)
@@ -150,8 +150,8 @@ cc_library(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client",
"//tensorflow/compiler/xla/client:global_data",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:test_utils",
"//tensorflow/core:lib",
],
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index de1d785e19..9225b1acd6 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -18,8 +18,8 @@ limitations under the License.
#include <string>
#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index 8367e09450..632e8cc8bc 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -18,8 +18,8 @@ limitations under the License.
#include <memory>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
namespace xla {
diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h
index b47f5243f0..0c8a9b8cc0 100644
--- a/tensorflow/compiler/xla/client/lib/constants.h
+++ b/tensorflow/compiler/xla/client/lib/constants.h
@@ -18,7 +18,7 @@ limitations under the License.
#include <type_traits>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
diff --git a/tensorflow/compiler/xla/client/lib/constants_test.cc b/tensorflow/compiler/xla/client/lib/constants_test.cc
index f1e3439862..f4320f65c1 100644
--- a/tensorflow/compiler/xla/client/lib/constants_test.cc
+++ b/tensorflow/compiler/xla/client/lib/constants_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index d003d529cc..13db232556 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -16,7 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_
#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
namespace xla {
diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc
index 1df287d7db..14c259a7fa 100644
--- a/tensorflow/compiler/xla/client/lib/math_test.cc
+++ b/tensorflow/compiler/xla/client/lib/math_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/client/lib/numeric.cc b/tensorflow/compiler/xla/client/lib/numeric.cc
index a6e460aa75..1c91237ae1 100644
--- a/tensorflow/compiler/xla/client/lib/numeric.cc
+++ b/tensorflow/compiler/xla/client/lib/numeric.cc
@@ -106,4 +106,32 @@ XlaOp GetMatrixDiagonal(XlaOp x) {
});
}
+XlaOp Triangle(XlaOp x, bool lower) {
+ XlaBuilder* builder = x.builder();
+ return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+ TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+ const int64 n_dims = ShapeUtil::Rank(shape);
+ TF_RET_CHECK(n_dims >= 2);
+ const int64 m = shape.dimensions(n_dims - 2);
+ const int64 n = shape.dimensions(n_dims - 1);
+ tensorflow::gtl::ArraySlice<int64> major_dims(
+ AsInt64Slice(shape.dimensions()), /*pos=*/0, /*len=*/n_dims - 2);
+ auto a = Iota(builder, U32, n);
+ auto b = Iota(builder, U32, m);
+ xla::XlaOp indicator;
+ if (lower) {
+ indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+ } else {
+ indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+ }
+ auto mask = Broadcast(indicator, major_dims);
+
+ return Select(mask, x, Zeros(builder, shape));
+ });
+}
+
+XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
+
+XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
+
} // namespace xla
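Triangle compares a row iota against a column iota to build a keep/zero mask over the last two dimensions, then selects between x and zeros. That is exactly NumPy's tril/triu semantics, including batching over leading dimensions; a quick illustration using the same 2x3x4 iota-filled input as the test added to numeric_test.cc later in this diff:

import numpy as np

x = np.arange(24).reshape(2, 3, 4)
# np.tril/np.triu apply to the final two axes of a batched array,
# matching Triangle's indicator-and-select construction.
print(np.tril(x)[0])
# [[ 0  0  0  0]
#  [ 4  5  0  0]
#  [ 8  9 10  0]]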
diff --git a/tensorflow/compiler/xla/client/lib/numeric.h b/tensorflow/compiler/xla/client/lib/numeric.h
index e9037b722c..efd8cdc257 100644
--- a/tensorflow/compiler/xla/client/lib/numeric.h
+++ b/tensorflow/compiler/xla/client/lib/numeric.h
@@ -16,7 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -34,6 +34,15 @@ XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
// diagonal elements (i.e., with indices [..., i, i]).
XlaOp GetMatrixDiagonal(XlaOp x);
+// Returns the upper or lower triangular part of the last two dimensions,
+// zeroing the remaining elements.
+XlaOp Triangle(XlaOp x, bool lower);
+
+// Returns the upper triangular part of the last two dimensions.
+XlaOp UpperTriangle(XlaOp x);
+
+// Returns the lower triangular part of the last two dimensions.
+XlaOp LowerTriangle(XlaOp x);
+
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
diff --git a/tensorflow/compiler/xla/client/lib/numeric_test.cc b/tensorflow/compiler/xla/client/lib/numeric_test.cc
index bfea3f539d..8a96ec68d2 100644
--- a/tensorflow/compiler/xla/client/lib/numeric_test.cc
+++ b/tensorflow/compiler/xla/client/lib/numeric_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -30,6 +30,9 @@ class NumericTest : public ClientLibraryTestBase {
void TestMatrixDiagonal();
};
+// TODO(b/64798317): Delete this test case once xla::IotaGen is converted to
+// xla::Iota. This test is already implemented for xla::IotaGen in
+// xla/tests/iota_test.cc.
XLA_TEST_F(NumericTest, Iota) {
XlaBuilder builder(TestName());
Iota(&builder, S32, 10);
@@ -37,6 +40,20 @@ XLA_TEST_F(NumericTest, Iota) {
ComputeAndCompareR1<int32>(&builder, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {});
}
+XLA_TEST_F(NumericTest, Triangle) {
+ XlaBuilder builder(TestName());
+ Array3D<int32> input(2, 3, 4);
+ input.FillIota(0);
+
+ XlaOp a;
+ auto a_data = CreateR3Parameter<int32>(input, 0, "a", &builder, &a);
+ LowerTriangle(a);
+ Array3D<int32> expected({{{0, 0, 0, 0}, {4, 5, 0, 0}, {8, 9, 10, 0}},
+ {{12, 0, 0, 0}, {16, 17, 0, 0}, {20, 21, 22, 0}}});
+
+ ComputeAndCompareR3<int32>(&builder, expected, {a_data.get()});
+}
+
template <typename T>
void NumericTest::TestMatrixDiagonal() {
XlaBuilder builder("GetMatrixDiagonal");
diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc
index 299a6ac2b6..3a744148fb 100644
--- a/tensorflow/compiler/xla/client/lib/prng.cc
+++ b/tensorflow/compiler/xla/client/lib/prng.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/lib/math.h"
#include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/casts.h"
diff --git a/tensorflow/compiler/xla/client/lib/prng.h b/tensorflow/compiler/xla/client/lib/prng.h
index ac86390239..ad000b1fa1 100644
--- a/tensorflow/compiler/xla/client/lib/prng.h
+++ b/tensorflow/compiler/xla/client/lib/prng.h
@@ -18,7 +18,7 @@ limitations under the License.
#include <array>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
namespace xla {
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index 534c509868..b1a776b8b8 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/testing.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/execution_options_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index dc613099e2..03695ce2a3 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
namespace xla {
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 5f9710914b..e7250e11d5 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -18,10 +18,12 @@ limitations under the License.
#include <utility>
#include "llvm/ADT/Triple.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
#include "tensorflow/compiler/xla/service/source_map_util.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/compiler/xla/status_macros.h"
using xla::source_map_util::InvalidParameterArgument;
@@ -29,8 +31,8 @@ using xla::source_map_util::InvalidParameterArgument;
namespace xla {
namespace {
-StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
- Backend* backend) {
+StatusOr<StreamPool::Ptr> BorrowStreamForDevice(int device_ordinal,
+ Backend* backend) {
if (device_ordinal < 0) {
device_ordinal = backend->default_device_ordinal();
}
@@ -141,7 +143,7 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
TF_RETURN_IF_ERROR(
ValidateExecutionOptions(arguments, run_options, *backend_));
- Backend::StreamPtr stream;
+ StreamPool::Ptr stream;
if (run_options.stream() == nullptr) {
// NB! The lifetime of `stream` needs to match the lifetime of
// `actual_options` (otherwise we will end up using a returned stream in
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 4d9e0d7cd9..ae23809261 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/executable_build_options.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/executable_run_options.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index ced26fc2ed..53be5a79c2 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include <functional>
#include <numeric>
@@ -22,6 +22,7 @@ limitations under the License.
#include <utility>
#include "tensorflow/compiler/xla/client/sharding_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/execution_options_util.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -2873,4 +2874,11 @@ XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
grad_output, epsilon, feature_index);
}
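+
+// Emits a rank-1 iota of `size` elements of the given primitive type as an
+// HLO kIota instruction, reporting any error through the builder's deferred
+// mechanism (see the TODO on IotaGen in xla_builder.h).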
+XlaOp IotaGen(XlaBuilder* builder, PrimitiveType type, int64 size) {
+ HloInstructionProto instr;
+ *instr.mutable_shape() = ShapeUtil::MakeShape(type, {size});
+ return builder->ReportErrorOrReturn(
+ builder->AddInstruction(std::move(instr), HloOpcode::kIota));
+}
+
} // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
new file mode 100644
index 0000000000..ae331407d6
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -0,0 +1,2241 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_BUILDER_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_BUILDER_H_
+
+#include <map>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/stacktrace.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+class XlaBuilder;
+
+// This represents an instruction that has been enqueued using the XlaBuilder.
+// It is used to pass the instruction as an operand to subsequently enqueued
+// operations.
+class XlaOp {
+ public:
+ XlaOp() : handle_(-1), builder_(nullptr) {
+ static_assert(std::is_trivially_destructible<XlaOp>::value,
+ "XlaOp should be trivially destructible");
+ }
+ ~XlaOp() = default;
+
+ // Precondition: !IsUninitialized().
+ //
+ // It's very common to do foo.builder()->bar(). Without this precondition, if
+ // foo.builder() is null, the call to bar will segfault at some point possibly
+ // deep in the callstack when we finally dereference `this`. The precondition
+ // lets us avoid this tricky-to-debug problem.
+ XlaBuilder* builder() const {
+ CHECK(builder_ != nullptr);
+ return builder_;
+ }
+
+ // Returns true if the XlaOp represents a valid, non-erroneous value.
+ bool valid() const { return handle_ >= 0; }
+
+ // Returns true if the XlaOp was created by the XlaOp() constructor and
+ // not returned by a builder.
+ bool IsUninitialized() const { return builder_ == nullptr; }
+
+ bool IsIdenticalTo(const XlaOp& rhs) const {
+ return handle_ == rhs.handle_ && builder_ == rhs.builder_;
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) {
+ out << op.handle();
+ return out;
+ }
+
+ private:
+ explicit XlaOp(XlaBuilder* builder) : handle_(-1), builder_(builder) {}
+ XlaOp(int64 handle, XlaBuilder* builder)
+ : handle_(handle), builder_(builder) {}
+
+ int64 handle() const { return handle_; }
+
+ friend class XlaBuilder;
+
+ // < 0 means "invalid handle".
+ int64 handle_;
+
+ // Not owned. Non-null for any handle returned by XlaBuilder, even if the
+ // handle is invalid.
+ XlaBuilder* builder_;
+};
+
+// Arithmetic operator overloads for the XlaOp type.
+XlaOp operator-(const XlaOp& x);
+XlaOp operator+(const XlaOp& x, const XlaOp& y);
+XlaOp operator-(const XlaOp& x, const XlaOp& y);
+XlaOp operator*(const XlaOp& x, const XlaOp& y);
+XlaOp operator/(const XlaOp& x, const XlaOp& y);
+XlaOp operator%(const XlaOp& x, const XlaOp& y);
+
+// Bitwise operator overloads for the XlaOp type.
+XlaOp operator~(const XlaOp& x);
+XlaOp operator&(const XlaOp& x, const XlaOp& y);
+XlaOp operator|(const XlaOp& x, const XlaOp& y);
+XlaOp operator^(const XlaOp& x, const XlaOp& y);
+XlaOp operator<<(const XlaOp& x, const XlaOp& y);
+// Performs a right arithmetic shift if 'x' is of signed type, otherwise
+// performs a right logical shift.
+XlaOp operator>>(const XlaOp& x, const XlaOp& y);
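+
+// For example (illustrative), given ops `x` and `y` from the same builder,
+// the overloads above let an expression enqueue instructions directly:
+//
+//   XlaOp z = (x + y) * x - y % x;  // enqueues Add, Mul, Rem and Sub.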
+
+// We don't overload the relational operators (==, !=, <, <=, >, >=) because
+// the semantics might be surprising since their result types are usually
+// 'bool'. Further, programmers may expect == to be a structural equality.
+// We also choose not to overload any of the mutating operators (e.g., +=, -=)
+// because the semantics might be misleading: XLA computations are immutable.
+
+// A convenient interface for building up computations.
+//
+// Thread-compatible.
+class XlaBuilder {
+ public:
+ // computation_name: name to use for the built computation.
+ XlaBuilder(const string& computation_name);
+
+ XlaBuilder(const XlaBuilder&) = delete;
+ XlaBuilder& operator=(const XlaBuilder&) = delete;
+
+ ~XlaBuilder();
+
+ // Returns the computation name.
+ const string& name() const { return name_; }
+
+ // Sets OpMetadata that will be added to all instructions until cleared.
+ //
+ // OpMetadata is often applied to a series of XLA HLO instructions. As a
+ // result, OpMetadata is set on the Computation Builder. All subsequent
+ // instructions generated via this Computation Builder will have the same
+ // OpMetadata attached until a call to ClearOpMetadata.
+ void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
+
+ // Clears the HloMetadata state.
+ void ClearOpMetadata() { metadata_.Clear(); }
+
+ // Sets an OpSharding that will be attached to all instructions until cleared.
+ void SetSharding(const OpSharding& sharding) { sharding_ = sharding; }
+
+ // Clears the sharding. Ops will be sharded according to the default placement
+ // policy.
+ void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; }
+
+ // Returns the OpSharding that will be attached to all instructions.
+ const tensorflow::gtl::optional<OpSharding>& sharding() const {
+ return sharding_;
+ }
+
+ // Sets the builder to a mode where it will die immediately when an error is
+ // encountered, rather than producing it in a deferred fashion when Build() is
+ // called (which is the default).
+ void set_die_immediately_on_error(bool enabled) {
+ die_immediately_on_error_ = enabled;
+ }
+
+ // Default dimension numbers used for a 2D convolution.
+ static constexpr int64 kConvBatchDimension = 0;
+ static constexpr int64 kConvFeatureDimension = 1;
+ static constexpr int64 kConvFirstSpatialDimension = 2;
+ static constexpr int64 kConvSecondSpatialDimension = 3;
+ static constexpr int64 kConvKernelOutputDimension = 0;
+ static constexpr int64 kConvKernelInputDimension = 1;
+ static constexpr int64 kConvKernelFirstSpatialDimension = 2;
+ static constexpr int64 kConvKernelSecondSpatialDimension = 3;
+
+ // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for
+ // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for
+ // the kernel operand
+ // {output_feature, input_feature, height, width} = {0, 1, 2, 3}.
+ static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers(
+ int num_spatial_dims = 2);
+
+ // Returns an error if the convolution dimension numbers have conflicts.
+ static Status Validate(const ConvolutionDimensionNumbers& dnum);
+
+ // Returns a new XlaBuilder whose resultant Computation is used only by this
+ // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error
+ // behavior as the parent.
+ std::unique_ptr<XlaBuilder> CreateSubBuilder(const string& computation_name);
+
+ // Builds the computation with the requested operations, or returns a non-ok
+ // status. Note that all ops that have been enqueued will be moved to the
+ // computation being returned.
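+ //
+ // A minimal sketch (illustrative), using the free functions declared later
+ // in this header:
+ //
+ //   XlaBuilder b("add_one");
+ //   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x");
+ //   Add(x, ConstantR0<float>(&b, 1.0f));  // the last enqueued op is the root.
+ //   StatusOr<XlaComputation> computation = b.Build();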
+ StatusOr<XlaComputation> Build();
+
+ // Builds the computation with the requested operations, or notes an error in
+ // the parent XlaBuilder and returns an empty computation if building failed.
+ // This function is intended to be used where the returned XlaComputation is
+ // only used by the parent XlaBuilder, in which case further operations on the
+ // returned XlaComputation will simply produce errors if an error occurred
+ // while building this computation. If the built computation is to be used by
+ // an XlaBuilder other than the parent XlaBuilder then Build() should be used
+ // instead.
+ XlaComputation BuildAndNoteError();
+
+ // Returns a subgraph that roots on the given root. If the root is not a
+ // compile-time constant (see `IsConstant`), returns an error.
+ //
+ // This will copy the needed ops/computations to the subgraph.
+ StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op) const;
+
+ // Returns the first error that was encountered while building the
+ // computation. When an error is encountered, by default we return a vacuous
+ // XlaOp and inform the user of the error that occurred while
+ // building the computation when they make a final call to Build().
+ //
+ // See also set_die_immediately_on_error().
+ Status first_error() const { return first_error_; }
+
+ // Returns the shape of the given op.
+ StatusOr<Shape> GetShape(const XlaOp& op) const;
+
+ // Returns the (inferred) result for the current computation's shape.
+ StatusOr<ProgramShape> GetProgramShape() const;
+
+ // Reports an error to the builder, by
+ // * storing it internally and capturing a backtrace if it's the first error
+ // (this deferred value will be produced on the call to
+ // Build()/GetShape()/...)
+ // * dying if die_immediately_on_error_ is true.
+ // Returns an XlaOp with an invalid handle but a valid builder. This value can
+ // be returned in place of a value in APIs that return an XlaOp.
+ XlaOp ReportError(const Status& error);
+
+ // A helper function that converts a StatusOr<XlaOp> into an XlaOp.
+ // If the Status was an error, reports the error to builder and returns an
+ // invalid XlaOp handle.
+ XlaOp ReportErrorOrReturn(const StatusOr<XlaOp>& op);
+
+ // A helper function that runs a function that returns a StatusOr<XlaOp> and
+ // returns an XlaOp.
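+ //
+ // This supports the deferred-error style used by the op-building functions;
+ // a sketch (illustrative), for some XlaOp `operand`:
+ //
+ //   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+ //     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(operand));
+ //     ...
+ //   });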
+ XlaOp ReportErrorOrReturn(const std::function<StatusOr<XlaOp>()>& op_creator);
+
+ // Returns true if 'operand' is a compile-time constant. A compile-time
+ // constant does not depend on any parameters, or on stateful operators such
+ // as `RngNormal` or `Infeed`.
+ //
+ // This tests whether a computation is a compile-time constant without
+ // evaluating the computation.
+ StatusOr<bool> IsConstant(const XlaOp& operand) const;
+
+ private:
+ // Enqueues a "retrieve parameter value" instruction for a parameter that was
+ // passed to the computation.
+ XlaOp Parameter(int64 parameter_number, const Shape& shape,
+ const string& name);
+
+ // Enqueues a constant with the value of the given literal onto the
+ // computation.
+ XlaOp ConstantLiteral(const LiteralSlice& literal);
+
+ // Enqueues a constant onto the computation. Methods are templated on the
+ // native host type (NativeT) which corresponds to a specific XLA
+ // PrimitiveType as given in the following table:
+ //
+ // Native Type PrimitiveType
+ // -----------------------------
+ // bool PRED
+ // int32 S32
+ // int64 S64
+ // uint32 U32
+ // uint64 U64
+ // float F32
+ // double F64
+ //
+ // Note: not all primitive types defined in xla_data.proto have a
+ // corresponding native type yet.
+ template <typename NativeT>
+ XlaOp ConstantR0(NativeT value);
+ template <typename NativeT>
+ XlaOp ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values);
+ XlaOp ConstantR1(const tensorflow::core::Bitmap& values);
+ template <typename NativeT>
+ XlaOp ConstantR2(
+ std::initializer_list<std::initializer_list<NativeT>> values);
+ template <typename NativeT>
+ XlaOp ConstantFromArrayWithLayout(const Array<NativeT>& values,
+ const Layout& layout);
+ template <typename NativeT>
+ XlaOp ConstantFromArray(const Array<NativeT>& values);
+ template <typename NativeT>
+ XlaOp ConstantR2FromArray2DWithLayout(const Array2D<NativeT>& values,
+ const Layout& layout);
+ template <typename NativeT>
+ XlaOp ConstantR2FromArray2D(const Array2D<NativeT>& values);
+ template <typename NativeT>
+ XlaOp ConstantR3FromArray3DWithLayout(const Array3D<NativeT>& values,
+ const Layout& layout);
+ template <typename NativeT>
+ XlaOp ConstantR3FromArray3D(const Array3D<NativeT>& values);
+ template <typename NativeT>
+ XlaOp ConstantR4FromArray4DWithLayout(const Array4D<NativeT>& values,
+ const Layout& layout);
+ template <typename NativeT>
+ XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
+
+ // Enqueues a rank one constant (vector) onto the computation. The vector has
+ // size 'length' and every element has the value 'value'.
+ template <typename NativeT>
+ XlaOp ConstantR1(int64 length, NativeT value);
+
+ // Adds dimensions to an array by duplicating the data in the array.
+ //
+ // The new dimensions are inserted on the left, i.e. if
+ // broadcast_sizes has values {a0, ..., aN} and the operand shape
+ // has dimensions {b0, ..., bM} then the shape of the output has
+ // dimensions {a0, ..., aN, b0, ..., bM}.
+ //
+ // The new dimensions index into copies of the operand, i.e.
+ //
+ // output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
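+ //
+ // For example (illustrative), broadcasting an f32[2] operand with
+ // broadcast_sizes {3} produces shape f32[3, 2], where output[i, j] ==
+ // operand[j].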
+ XlaOp Broadcast(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+
+ // Performs in-dimension-style broadcast.
+ //
+ // Operand specifies the input to be broadcast. "shape" is the expected output
+ // shape. "broadcast_dimensions" are the dimensions to broadcast into:
+ // dimension numbers in broadcast_dimensions map to individual dimensions
+ // of the operand, and specify which dimension of the output shape each
+ // operand dimension should be broadcast to.
+ // e.g.
+ // Say operand = [1, 2], i.e., a 1D tensor with 2 elements,
+ // and the output shape has dimensions [2,2].
+ // Specifying {1} as broadcast_dimensions will generate output
+ // [1 , 2]
+ // [1 , 2]
+ // On the other hand, specifying {0} as broadcast_dimensions
+ // will generate output
+ // [1 , 1]
+ // [2 , 2]
+ XlaOp BroadcastInDim(
+ const XlaOp& operand, const Shape& shape,
+ const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+ // Enqueues a pad operation onto the computation that pads the given value on
+ // the edges as well as between the elements of the input. padding_config
+ // specifies the padding amount for each dimension.
+ XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+ const PaddingConfig& padding_config);
+
+ // Enqueues an operation onto the computation that flattens the operand based
+ // on the dimension order (major/slowest-varying to minor/fastest-varying)
+ // given, followed by reshaping it into the shape with the given dimension
+ // sizes (also major to minor). Conceptually, this is a limited form of
+ // "shape casting".
+ XlaOp Reshape(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> dimensions,
+ tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+ // Enqueues an operation onto the computation that collapses the operand, from
+ // first to last dimension (C order), then reshapes it to the given dimension
+ // sizes. Conceptually, this is a limited form of "shape casting".
+ XlaOp Reshape(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+ // Wrapper for Reshape.
+ // Enqueues an operation to collapse the provided dimensions; e.g. an
+ // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
+ // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
+ // be a consecutive, in-order subsequence of the operand dimensions.
+ //
+ // Note that collapsing a single dimension does nothing:
+ //
+ // {256} collapsing {0} => {256}
+ // {1} collapsing {0} => {1}
+ //
+ // Collapsing multiple dimensions produces a single result dimension:
+ //
+ // {256, 2} collapsing {0,1} => {512}
+ // {256, 2, 3} collapsing {0,1} => {512, 3}
+ //
+ // This could potentially cause data to be moved -- it provides a more
+ // structured form of reshaping than an arbitrary Reshape operation.
+ XlaOp Collapse(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> dimensions);
+
+ // Enqueues a slice operation onto the computation that slices the operand
+ // from the start indices to the limit indices; e.g.
+ //
+ // x
+ // [ 0 1 2 3 ]
+ // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
+ // [ 8 9 a b ]
+ //
+ // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
+ // range notation.
+ // The strides parameter determines the stride over the slice.
+ XlaOp Slice(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> start_indices,
+ tensorflow::gtl::ArraySlice<int64> limit_indices,
+ tensorflow::gtl::ArraySlice<int64> strides);
+
+ // Enqueues a slice operation in a given dimension, taking all other
+ // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
+ // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
+ // for:
+ //
+ // array[:, 2:4:1, :]
+ XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
+ int64 stride, int64 dimno);
+
+ // Enqueues a slice operation onto the computation that slices the 'operand'
+ // from dynamic start indices which are passed in 'start_indices'.
+ // The size of the slice in each dimension is passed in 'slice_sizes',
+ // which specifies the end point of exclusive slice intervals in each
+ // dimension [start, start + size).
+ // The shape of 'start_indices' must be rank == 1, with dimension size
+ // equal to the rank of the 'operand'.
+ // Slice index calculations are computed modulo input dimension sizes to
+ // prevent dynamic start indices from generating out-of-bound array accesses.
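+ //
+ // For example (illustrative): with operand [[1, 2, 3], [4, 5, 6]], runtime
+ // start_indices {0, 1} and slice_sizes {2, 2} produce [[2, 3], [5, 6]].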
+ XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+ tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+ // Enqueues a dynamic update slice operation onto the computation, which
+ // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
+ // The shape of 'update' determines the shape of the slice of 'operand'
+ // which is updated.
+ // The indices specified in 'start_indices' specify the offset of the slice
+ // of 'operand' which is updated.
+ //
+ // update = {10, 11} // calculated at runtime.
+ // [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ]
+ // [4 5 6] => DynamicUpdateslice(data, update, start) => [4 10 11]
+ // [7 8 9] [7 8 9 ]
+ //
+ // The shape of 'start_indices' must be rank == 1, with dimension size
+ // equal to the rank of the 'operand'.
+ // Slice index calculations are computed modulo update dimension sizes to
+ // prevent dynamic start indices from generating out-of-bound array accesses.
+ XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+ const XlaOp& start_indices);
+
+ // Enqueues a concatenate instruction onto the computation. 'operands' must
+ // have >= 1 entry.
+ XlaOp ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
+ int64 dimension);
+
+ // Enqueues a tracing operation onto the computation; the computation will emit
+ // a logging message with the operand.
+ void Trace(const string& tag, const XlaOp& operand);
+
+ // Enqueues a conditional-move-like select operation onto the computation;
+ // predicated on pred, selects between on_true and on_false.
+ XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
+
+ // Enqueues a tuple-creation instruction onto the computation.
+ XlaOp Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements);
+
+ // Enqueues a tuple-element-get instruction onto the computation.
+ XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
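+
+ // For example (illustrative), using the free functions declared later in
+ // this header, with `builder` the enclosing XlaBuilder and `x`, `y` ops on
+ // it:
+ //
+ //   XlaOp t = Tuple(&builder, {x, y});
+ //   XlaOp first = GetTupleElement(t, 0);  // same value as x.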
+
+ // Enqueues an equal-to comparison instruction onto the computation.
+ XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a not-equal comparison instruction onto the computation.
+ XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a greater-or-equal comparison instruction onto the computation.
+ XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a greater-than comparison instruction onto the computation.
+ XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a less-than comparison instruction onto the computation.
+ XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a less-or-equal comparison instruction onto the computation.
+ XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a dot instruction onto the computation.
+ XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+
+ // Enqueues a general dot instruction onto the computation.
+ XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+ const DotDimensionNumbers& dimension_numbers);
+
+ // Enqueues a convolution instruction onto the computation, which uses the
+ // default convolution dimension numbers.
+ XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ Padding padding);
+
+ // Enqueues a convolution instruction onto the computation, with the caller
+ // provided padding configuration in the format returned by MakePadding().
+ XlaOp ConvWithGeneralPadding(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+ // Enqueues a convolution instruction onto the computation, with the caller
+ // provided dimension numbers configuration.
+ XlaOp ConvWithGeneralDimensions(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+ const ConvolutionDimensionNumbers& dimension_numbers);
+
+ // Enqueues a convolution instruction onto the computation, with the caller
+ // provided padding configuration as well as the dimension numbers.
+ XlaOp ConvGeneral(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ const ConvolutionDimensionNumbers& dimension_numbers);
+
+ // Enqueues a convolution instruction onto the computation, with the caller
+ // provided padding configuration, dilation factors and dimension numbers.
+ XlaOp ConvGeneralDilated(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+ tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+ const ConvolutionDimensionNumbers& dimension_numbers);
+
+ // Enqueues an FFT instruction onto the computation, of the given type and
+ // with the given FFT length.
+ XlaOp Fft(const XlaOp& operand, FftType fft_type,
+ tensorflow::gtl::ArraySlice<int64> fft_length);
+
+ // Enqueues an infeed instruction onto the computation, which writes data of
+ // the given shape to the infeed buffer of the device.
+ XlaOp Infeed(const Shape& shape, const string& config = "");
+ XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
+ const string& config = "");
+
+ // Enqueues an outfeed instruction onto the computation. This instruction
+ // generates outgoing data transfers for the given data.
+ //
+ // shape_with_layout communicates the laid out shape that we want to outfeed
+ // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
+ // will occur.
+ void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+ const string& outfeed_config);
+ XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
+ const Shape& shape_with_layout,
+ const string& outfeed_config);
+
+ // Enqueues a call instruction onto the computation.
+ XlaOp Call(const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<XlaOp> operands);
+
+ // Enqueues a custom call instruction onto the computation.
+ // During code generation, a call instruction is emitted which targets a
+ // symbol with the name |call_target_name|. The |operands| are passed to the
+ // call instruction. |shape| is the resultant shape.
+ XlaOp CustomCall(const string& call_target_name,
+ tensorflow::gtl::ArraySlice<XlaOp> operands,
+ const Shape& shape);
+
+ // Enqueues a pseudo-op to represent host-side computation data-dependencies.
+ // During code generation, host send and receive operations will be generated
+ // to transfer |operands| to the host and a single result of |shape| back to
+ // the device. Host send/recv operations are emitted using |channel_name|.
+ // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
+ // instruction scheduling.
+ XlaOp HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
+ const string& channel_name, int64 cost_estimate_ns,
+ const Shape& shape);
+
+ // The following methods enqueue element-wise binary arithmetic operations
+ // onto the computation. The shapes of the operands have to match unless one
+ // of the operands is a scalar, or an explicit broadcast dimension is given
+ // (see g3doc for more details).
+
+ // Enqueues a complex compose instruction onto the computation.
+ XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a complex conjugate instruction onto the computation.
+ XlaOp Conj(const XlaOp& operand);
+
+ // Enqueues an add instruction onto the computation.
+ XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a subtract instruction onto the computation.
+ XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a multiply instruction onto the computation.
+ XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a divide instruction onto the computation.
+ XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a remainder instruction onto the computation.
+ XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a max instruction onto the computation.
+ XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues a min instruction onto the computation.
+ XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Element-wise logical operators
+ XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ XlaOp Not(const XlaOp& operand);
+
+ XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+ XlaOp ShiftRightArithmetic(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+ XlaOp ShiftRightLogical(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Reduces an array among the provided dimensions, given "computation" as a
+ // reduction operator.
+ XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+
+ // Convenience wrapper around the above that reduces all the dimensions in the
+ // operand shape.
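+ //
+ // A sketch (illustrative) of summing every element of an op `x`, where
+ // `builder` is the enclosing XlaBuilder and the free functions are those
+ // declared later in this header:
+ //
+ //   auto add_b = builder.CreateSubBuilder("add");
+ //   auto a = Parameter(add_b.get(), 0, ShapeUtil::MakeShape(F32, {}), "a");
+ //   auto b = Parameter(add_b.get(), 1, ShapeUtil::MakeShape(F32, {}), "b");
+ //   Add(a, b);
+ //   XlaOp sum = ReduceAll(x, ConstantR0<float>(&builder, 0.0f),
+ //                         add_b->BuildAndNoteError());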
+ XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation);
+
+ // Enqueues a windowed reduce instruction onto the computation.
+ XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ Padding padding);
+
+ // As ReduceWindow(), but the padding is given in the format
+ // returned by MakePadding().
+ XlaOp ReduceWindowWithGeneralPadding(
+ const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+ // Returns the sum of the operand value within each subgroup of replicas. All
+ // replicas supply one input to the sum and all replicas receive the resulting
+ // sum for each subgroup.
+ XlaOp CrossReplicaSum(
+ const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
+
+ // Enqueues an operation that does an AllReduce of the operand across cores.
+ // Here AllReduce means doing a reduction on the input operand across cores
+ // and then broadcasting the reduction result to those cores. The reduction
+ // function is defined by `computation`, which should be a commutative
+ // computation on scalars, e.g., add, min, or max. The way that AllReduce is
+ // applied is configured by:
+ //
+ // - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
+ // replicas belong to one group. AllReduce will be applied within subgroups.
+ // For example, with 4 replicas, replica_group_ids={0,1,0,1} means that
+ // replicas 0 and 2 are in subgroup 0 and replicas 1 and 3 are in subgroup 1.
+ //
+ // - `channel_id`: for AllReduce nodes from different models, if they have the
+ // same channel_id, they will be 'AllReduce'd. If empty, AllReduce will not be
+ // applied across models.
+ //
+ // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
+ XlaOp CrossReplicaSum(
+ const XlaOp& operand, const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
+ const tensorflow::gtl::optional<ChannelHandle>& channel_id =
+ tensorflow::gtl::nullopt);
+
+ // Enqueues an operation that scatters the `source` array to the selected
+ // indices of each window.
+ XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ Padding padding, const XlaOp& source,
+ const XlaOp& init_value,
+ const XlaComputation& scatter);
+
+ // As SelectAndScatter(), but the padding is given in the format
+ // returned by MakePadding().
+ XlaOp SelectAndScatterWithGeneralPadding(
+ const XlaOp& operand, const XlaComputation& select,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ const XlaOp& source, const XlaOp& init_value,
+ const XlaComputation& scatter);
+
+ // Enqueues an abs instruction onto the computation.
+ XlaOp Abs(const XlaOp& operand);
+
+ // Enqueues a atan2 instruction onto the computation.
+ XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues an exp instruction onto the computation.
+ XlaOp Exp(const XlaOp& operand);
+
+ // Enqueues an expm1 instruction onto the computation.
+ XlaOp Expm1(const XlaOp& operand);
+
+ // Enqueues a floor instruction onto the computation.
+ XlaOp Floor(const XlaOp& operand);
+
+ // Enqueues a ceil instruction onto the computation.
+ XlaOp Ceil(const XlaOp& operand);
+
+ // Enqueues a round instruction onto the computation, rounding to the nearest
+ // integer with half-way cases rounding away from zero.
+ XlaOp Round(const XlaOp& operand);
+
+ // Enqueues a log instruction (natural logarithm) onto the computation.
+ XlaOp Log(const XlaOp& operand);
+
+ // Enqueues a log1p instruction (log(x+1)) onto the computation.
+ XlaOp Log1p(const XlaOp& operand);
+
+ // Enqueues a sign instruction onto the computation.
+ XlaOp Sign(const XlaOp& operand);
+
+ // Enqueues a count leading zeros instruction onto the computation.
+ XlaOp Clz(const XlaOp& operand);
+
+ // Enqueues a cosine instruction onto the computation.
+ XlaOp Cos(const XlaOp& operand);
+
+ // Enqueues a sine instruction onto the computation.
+ XlaOp Sin(const XlaOp& operand);
+
+ // Enqueues a tanh instruction onto the computation.
+ XlaOp Tanh(const XlaOp& operand);
+
+ // Enqueues a real-part instruction onto the computation.
+ XlaOp Real(const XlaOp& operand);
+
+ // Enqueues an imaginary-part instruction onto the computation.
+ XlaOp Imag(const XlaOp& operand);
+
+ // Enqueues a lhs^rhs computation onto the computation.
+ XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+ // Enqueues an operator that tests if the operand's values are finite, i.e.,
+ // not Inf or NaN. Defined only for floating-point types. Returns an array of
+ // booleans with the same shape where entries are true iff the corresponding
+ // entry was finite.
+ XlaOp IsFinite(const XlaOp& operand);
+
+ // Enqueues a convert instruction onto the computation that changes the
+ // element type of the operand array to primitive_type.
+ XlaOp ConvertElementType(const XlaOp& operand,
+ PrimitiveType new_element_type);
+
+ // Enqueues a no-op instruction onto the computation that changes
+ // the element type of the operand array to primitive_type. The
+ // bit-widths of the source and destination element types must be
+ // identical.
+ XlaOp BitcastConvertType(const XlaOp& operand,
+ PrimitiveType new_element_type);
+
+ // Enqueues a negate instruction onto the computation.
+ XlaOp Neg(const XlaOp& operand);
+
+ // Enqueues a transpose instruction onto the computation.
+ XlaOp Transpose(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> permutation);
+
+ // Enqueues a reverse instruction onto the computation. The order of the
+ // elements in the given dimensions is reversed (i.e., the element at index i
+ // is moved to index dimension_size - 1 - i).
+ XlaOp Rev(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> dimensions);
+
+ // Enqueues a sort (as increasing order) instruction onto the computation.
+ // If only keys are provided:
+ // * If the keys are a rank-1 tensor (an array), the result is a sorted array
+ // of keys, in ascending order.
+ // * If the keys have higher rank, the keys are sorted along the provided
+ // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
+ // value of 0 will independently sort every column, and a dimension value of 1
+ // will independently sort each row. If no dimension number is provided, then
+ // the last dimension is chosen by default.
+ //
+ // If both keys and values are provided:
+ // * The keys and the values must be tensors with the same dimensions. The
+ // element types of the tensors may be different.
+ // * The result is a tuple that consists of a sorted tensor of keys (along the
+ // provided dimension, as above) as the first element, and a tensor with their
+ // corresponding values as the second element.
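+ //
+ // For example (illustrative): for rank-1 keys [3, 1, 2], Sort(keys) yields
+ // [1, 2, 3]; with values [a, b, c], Sort(keys, values) yields the tuple
+ // ([1, 2, 3], [b, c, a]).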
+ XlaOp Sort(XlaOp keys,
+ tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
+ int64 dimension = -1);
+
+ // Enqueues a clamp instruction onto the computation.
+ XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+
+ // Enqueues a map instruction onto the computation.
+ XlaOp Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> dimensions,
+ tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
+
+ // Enqueues a N(mu, sigma) random number generation instruction onto the
+ // computation.
+ XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
+
+ // Enqueues a U(a, b) random number generation instruction onto the
+ // computation. Returns values in the semi-open interval [a, b).
+ XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+
+ // Enqueues a while node onto the computation.
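+ //
+ // A sketch (illustrative) of a counter loop, where `init` is an S32 scalar
+ // op from the enclosing builder:
+ //
+ //   XlaBuilder cond_b("cond"), body_b("body");
+ //   Shape s = ShapeUtil::MakeShape(S32, {});
+ //   Lt(Parameter(&cond_b, 0, s, "i"), ConstantR0<int32>(&cond_b, 10));
+ //   Add(Parameter(&body_b, 0, s, "i"), ConstantR0<int32>(&body_b, 1));
+ //   XlaOp result = While(cond_b.Build().ValueOrDie(),
+ //                        body_b.Build().ValueOrDie(), init);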
+ XlaOp While(const XlaComputation& condition, const XlaComputation& body,
+ const XlaOp& init);
+
+ // Enqueues a conditional node onto the computation.
+ XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+ const XlaComputation& true_computation,
+ const XlaOp& false_operand,
+ const XlaComputation& false_computation);
+
+ // Enqueues a ReducePrecision node onto the computation.
+ XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+ const int mantissa_bits);
+
+ // Enqueues a Gather node onto the computation.
+ XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+ const GatherDimensionNumbers& dimension_numbers,
+ tensorflow::gtl::ArraySlice<int64> window_bounds);
+
+ // Enqueues a Send node onto the computation for device-to-device
+ // communication, to send the given operand to a Recv instruction that shares
+ // the same channel handle.
+ void Send(const XlaOp& operand, const ChannelHandle& handle);
+ XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
+ const ChannelHandle& handle);
+
+ // Enqueues a Send node which sends data to the host.
+ XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
+ const Shape& shape_with_layout, const ChannelHandle& handle);
+
+ // Enqueues a Recv node which receives data from the host.
+ XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
+ const ChannelHandle& handle);
+
+ // Enqueues an AfterAll operation with no operands producing a token-shaped
+ // value.
+ XlaOp CreateToken();
+
+ // Enqueues an AfterAll operation that combines the given token-shaped
+ // operands into a single token-shaped value.
+ XlaOp AfterAll(tensorflow::gtl::ArraySlice<XlaOp> tokens);
+
+ // Enqueues a Recv node onto the computation. The data comes from a Send
+ // instruction that shares the same channel handle and its shape must
+ // be the same as the given shape.
+ XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
+ XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
+ const ChannelHandle& handle);
+
+ // Normalizes operand across spatial and batch dimensions for each feature.
+ //
+ // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
+ // is the normalized result and batch_mean and batch_var are the mean and
+ // variance, respectively, across batch for the operand.
+ XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+ const XlaOp& offset, float epsilon,
+ int64 feature_index);
+
+ // Normalizes operand across spatial and batch dimensions for each feature.
+ //
+ // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
+ // computing `mean` and `variance` for each batch inside the operation. It
+ // uses the input `mean` and `variance` instead as estimated values. The
+ // purpose of this op is to reduce latency in inference, hence the name
+ // `BatchNormInference`.
+ //
+ // The output has the same shape as `operand`, and contains the normalized
+ // values for each batch.
+ XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+ const XlaOp& offset, const XlaOp& mean,
+ const XlaOp& variance, float epsilon,
+ int64 feature_index);
+
+ // Calculates the gradients of a batch norm op.
+ //
+ // The inputs `batch_mean` and `batch_var` represent the mean and variance
+ // across the batch.
+ //
+ // Returns a tuple of three elements:
+ // - grad_operand: Gradient with respect to input `operand`
+ // - grad_offset: Gradient with respect to input `offset`
+ // - grad_scale: Gradient with respect to input `scale`
+ XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+ const XlaOp& batch_mean, const XlaOp& batch_var,
+ const XlaOp& grad_output, float epsilon,
+ int64 feature_index);
+
+ StatusOr<XlaOp> AddInstruction(
+ HloInstructionProto&& instr, HloOpcode opcode,
+ tensorflow::gtl::ArraySlice<XlaOp> operands = {});
+
+ void AddCalledComputation(const XlaComputation& computation,
+ HloInstructionProto* instr);
+
+ StatusOr<const HloInstructionProto*> LookUpInstruction(const XlaOp& op) const;
+
+ // Internal helper method that does the building for an arbitrary unary op.
+ XlaOp UnaryOp(HloOpcode unop, const XlaOp& operand);
+
+ // Internal helper method that does the building for an arbitrary binary op.
+ // broadcast_dimensions specifies which dimensions to use for broadcasting
+ // when the operation is between tensors of different ranks.
+ XlaOp BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+ // Internal helper method that does the building for an arbitrary ternary op.
+ XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
+ const XlaOp& ehs);
+
+ XlaOp RngOp(RandomDistribution distribution,
+ tensorflow::gtl::ArraySlice<XlaOp> parameters,
+ const Shape& shape);
+
+ StatusOr<XlaOp> InDimBroadcast(
+ const Shape& shape, const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+ // Internal helper method that creates a sequence of instructions that
+ // performs an explicit broadcast of the operand to the target shape.
+ StatusOr<XlaOp> AddBroadcastSequence(const Shape& output_shape,
+ const XlaOp& operand);
+
+ // Internal helper method for creating a Reshape op with the already inferred
+ // shape.
+ StatusOr<XlaOp> Reshape(const Shape& shape, const XlaOp& operand);
+
+ // Returns the (inferred) program shape for the current computation, and
+ // fills in the root instruction's id via the root_id pointer.
+ StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
+
+ // Returns shapes for the operands.
+ StatusOr<std::vector<Shape>> GetOperandShapes(
+ tensorflow::gtl::ArraySlice<XlaOp> operands) const;
+
+ // A visitor which checks whether an operation is a compile-time constant,
+ // meaning that it doesn't depend on any parameters, or on any stateful
+ // operation such as `RngNormal` or `Infeed`. The visitor walks the
+ // computation starting at a given operation and sets is_constant to false iff
+ // a parameter or stateful operation is encountered.
+ void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
+ bool* is_constant) const;
+
+ // Checks bounds for convolution parameters.
+ Status VerifyConvolution(
+ const Shape& lhs_shape, const Shape& rhs_shape,
+ const ConvolutionDimensionNumbers& dimension_numbers) const;
+
+ // Helper function for creating a Window proto from user-supplied data.
+ // Returns error if the user-supplied data was invalid.
+ StatusOr<Window> MakeWindow(
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+ tensorflow::gtl::ArraySlice<int64> rhs_dilation) const;
+
+ string name_; // Name to use for the built computation.
+
+ // The first error encountered while building the computation.
+ // This is OK until the first error is encountered.
+ Status first_error_;
+
+ // The saved stack trace from the point at which the first error occurred.
+ tensorflow::SavedStackTrace first_error_backtrace_;
+
+ // The instructions of this computation.
+ std::vector<HloInstructionProto> instructions_;
+
+ // The embedded computations used by this computation. Each computation was
+ // the entry computation of some XlaComputation; the key is the unique id of
+ // that XlaComputation.
+ std::map<int64, HloComputationProto> embedded_;
+
+ // The unique parameter numbers.
+ tensorflow::gtl::FlatSet<int64> parameter_numbers_;
+
+ // The metadata to attach to each op. This is structured as a "modal"-like
+ // operation, in order to simplify client code (and not sprinkle this metadata
+ // throughout the TensorFlow op kernel implementations).
+ OpMetadata metadata_;
+
+ // Sharding for this operator. This is structured as a "modal"-like operation,
+ // in order to simplify client code, similar to metadata_.
+ tensorflow::gtl::optional<OpSharding> sharding_;
+
+ // Mode bit that indicates whether to die when a first error is encountered.
+ bool die_immediately_on_error_ = false;
+
+ XlaBuilder* parent_builder_{nullptr};
+
+ friend XlaOp Parameter(XlaBuilder* builder, int64 parameter_number,
+ const Shape& shape, const string& name);
+ friend XlaOp ConstantLiteral(XlaBuilder* builder,
+ const LiteralSlice& literal);
+ template <typename NativeT>
+ friend XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
+ template <typename NativeT>
+ friend XlaOp ConstantR1(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<NativeT> values);
+ friend XlaOp ConstantR1(XlaBuilder* builder,
+ const tensorflow::core::Bitmap& values);
+ template <typename NativeT>
+ friend XlaOp ConstantR2(
+ XlaBuilder* builder,
+ std::initializer_list<std::initializer_list<NativeT>> values);
+ template <typename NativeT>
+ friend XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
+ const Array<NativeT>& values,
+ const Layout& layout);
+ template <typename NativeT>
+ friend XlaOp ConstantFromArray(XlaBuilder* builder,
+ const Array<NativeT>& values);
+ template <typename NativeT>
+ friend XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
+ const Array2D<NativeT>& values,
+ const Layout& layout);
+ template <typename NativeT>
+ friend XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
+ const Array2D<NativeT>& values);
+ template <typename NativeT>
+ friend XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
+ const Array3D<NativeT>& values,
+ const Layout& layout);
+ template <typename NativeT>
+ friend XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
+ const Array3D<NativeT>& values);
+ template <typename NativeT>
+ friend XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
+ const Array4D<NativeT>& values,
+ const Layout& layout);
+ template <typename NativeT>
+ friend XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
+ const Array4D<NativeT>& values);
+
+ template <typename NativeT>
+ friend XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
+
+ friend XlaOp Broadcast(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+
+ friend XlaOp BroadcastInDim(
+ const XlaOp& operand, const Shape& shape,
+ const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+ friend XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+ const PaddingConfig& padding_config);
+
+ friend XlaOp Reshape(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> dimensions,
+ tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+ friend XlaOp Reshape(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+ friend XlaOp Collapse(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> dimensions);
+
+ friend XlaOp Slice(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> start_indices,
+ tensorflow::gtl::ArraySlice<int64> limit_indices,
+ tensorflow::gtl::ArraySlice<int64> strides);
+
+ friend XlaOp SliceInDim(const XlaOp& operand, int64 start_index,
+ int64 limit_index, int64 stride, int64 dimno);
+
+ friend XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+ tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+ friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+ const XlaOp& start_indices);
+
+ friend XlaOp ConcatInDim(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<XlaOp> operands,
+ int64 dimension);
+
+ friend void Trace(const string& tag, const XlaOp& operand);
+
+ friend XlaOp Select(const XlaOp& pred, const XlaOp& on_true,
+ const XlaOp& on_false);
+ friend XlaOp Tuple(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<XlaOp> elements);
+ friend XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
+ friend XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+ friend XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+ const DotDimensionNumbers& dimension_numbers);
+ friend XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ Padding padding);
+ friend XlaOp ConvWithGeneralPadding(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+ friend XlaOp ConvWithGeneralDimensions(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+ const ConvolutionDimensionNumbers& dimension_numbers);
+ friend XlaOp ConvGeneral(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ const ConvolutionDimensionNumbers& dimension_numbers);
+ friend XlaOp ConvGeneralDilated(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+ tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+ const ConvolutionDimensionNumbers& dimension_numbers);
+ friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
+ tensorflow::gtl::ArraySlice<int64> fft_length);
+ friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
+ const string& config);
+ friend void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+ const string& outfeed_config);
+ friend XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<XlaOp> operands);
+ friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
+ tensorflow::gtl::ArraySlice<XlaOp> operands,
+ const Shape& shape);
+ friend XlaOp HostCompute(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<XlaOp> operands,
+ const string& channel_name, int64 cost_estimate_ns,
+ const Shape& shape);
+ friend XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Conj(const XlaOp& operand);
+ friend XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Not(const XlaOp& operand);
+ friend XlaOp ShiftLeft(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp ShiftRightArithmetic(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp ShiftRightLogical(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+ friend XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation);
+ friend XlaOp ReduceWindow(
+ const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
+ friend XlaOp ReduceWindowWithGeneralPadding(
+ const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+ friend XlaOp CrossReplicaSum(
+ const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> replica_group_ids);
+ friend XlaOp CrossReplicaSum(
+ const XlaOp& operand, const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> replica_group_ids,
+ const tensorflow::gtl::optional<ChannelHandle>& channel_id);
+ friend XlaOp SelectAndScatter(
+ const XlaOp& operand, const XlaComputation& select,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+ const XlaOp& source, const XlaOp& init_value,
+ const XlaComputation& scatter);
+ friend XlaOp SelectAndScatterWithGeneralPadding(
+ const XlaOp& operand, const XlaComputation& select,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ const XlaOp& source, const XlaOp& init_value,
+ const XlaComputation& scatter);
+ friend XlaOp Abs(const XlaOp& operand);
+ friend XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp Exp(const XlaOp& operand);
+ friend XlaOp Expm1(const XlaOp& operand);
+ friend XlaOp Floor(const XlaOp& operand);
+ friend XlaOp Ceil(const XlaOp& operand);
+ friend XlaOp Round(const XlaOp& operand);
+ friend XlaOp Log(const XlaOp& operand);
+ friend XlaOp Log1p(const XlaOp& operand);
+ friend XlaOp Sign(const XlaOp& operand);
+ friend XlaOp Clz(const XlaOp& operand);
+ friend XlaOp Cos(const XlaOp& operand);
+ friend XlaOp Sin(const XlaOp& operand);
+ friend XlaOp Tanh(const XlaOp& operand);
+ friend XlaOp Real(const XlaOp& operand);
+ friend XlaOp Imag(const XlaOp& operand);
+ friend XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+ friend XlaOp IsFinite(const XlaOp& operand);
+ // TODO(b/64798317): Finish CPU & GPU implementation, then replace xla::Iota
+ // in xla/client/lib/numeric.h with this (renamed to xla::Iota).
+ friend XlaOp IotaGen(XlaBuilder* builder, PrimitiveType type, int64 size);
+ friend XlaOp ConvertElementType(const XlaOp& operand,
+ PrimitiveType new_element_type);
+ friend XlaOp BitcastConvertType(const XlaOp& operand,
+ PrimitiveType new_element_type);
+ friend XlaOp Neg(const XlaOp& operand);
+ friend XlaOp Transpose(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> permutation);
+ friend XlaOp Rev(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> dimensions);
+ friend XlaOp Sort(XlaOp keys, tensorflow::gtl::optional<XlaOp> values,
+ int64 dimension);
+ friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+ friend XlaOp Map(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<XlaOp> operands,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> dimensions,
+ tensorflow::gtl::ArraySlice<XlaOp> static_operands);
+ friend XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma,
+ const Shape& shape);
+ friend XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+ friend XlaOp While(const XlaComputation& condition,
+ const XlaComputation& body, const XlaOp& init);
+ friend XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+ const XlaComputation& true_computation,
+ const XlaOp& false_operand,
+ const XlaComputation& false_computation);
+ friend XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+ const int mantissa_bits);
+ friend XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+ const GatherDimensionNumbers& dimension_numbers,
+ tensorflow::gtl::ArraySlice<int64> window_bounds);
+ friend void Send(const XlaOp& operand, const ChannelHandle& handle);
+ friend XlaOp Recv(XlaBuilder* builder, const Shape& shape,
+ const ChannelHandle& handle);
+ friend XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+ const XlaOp& offset, float epsilon,
+ int64 feature_index);
+ friend XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+ const XlaOp& offset, const XlaOp& mean,
+ const XlaOp& variance, float epsilon,
+ int64 feature_index);
+ friend XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+ const XlaOp& batch_mean, const XlaOp& batch_var,
+ const XlaOp& grad_output, float epsilon,
+ int64 feature_index);
+ friend XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
+ const ChannelHandle& handle);
+ friend XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
+ const ChannelHandle& handle);
+ friend XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
+ const Shape& shape_with_layout,
+ const ChannelHandle& handle);
+ friend XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
+ const ChannelHandle& handle);
+ friend XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
+ const string& config);
+ friend XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
+ const Shape& shape_with_layout,
+ const string& outfeed_config);
+ friend XlaOp CreateToken(XlaBuilder* builder);
+ friend XlaOp AfterAll(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<XlaOp> tokens);
+};
+
+// RAII-style object: sets the current sharding assignment in builder on
+// construction, and restores the previous assignment on destruction.
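+//
+// Example (illustrative sketch; assumes an OpSharding proto `sharding` built
+// elsewhere, e.g. with the helpers in sharding_builder):
+//
+//   xla::XlaBuilder b("sharded");
+//   {
+//     xla::XlaScopedShardingAssignment assign(&b, sharding);
+//     auto x = xla::ConstantR0<float>(&b, 1.0f);  // carries `sharding`
+//   }
+//   // Ops created here use the builder's previous sharding (if any).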
+class XlaScopedShardingAssignment {
+ public:
+ XlaScopedShardingAssignment(xla::XlaBuilder* builder,
+ tensorflow::gtl::optional<OpSharding> sharding)
+ : builder_(builder), prev_sharding_(builder->sharding()) {
+ SetSharding(sharding);
+ }
+
+ XlaScopedShardingAssignment(const XlaScopedShardingAssignment&) = delete;
+ XlaScopedShardingAssignment& operator=(const XlaScopedShardingAssignment&) =
+ delete;
+
+ ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); }
+
+ private:
+ void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
+ if (sharding.has_value()) {
+ builder_->SetSharding(sharding.value());
+ } else {
+ builder_->ClearSharding();
+ }
+ }
+
+ xla::XlaBuilder* const builder_;
+ tensorflow::gtl::optional<OpSharding> prev_sharding_;
+};
+
+// Free functions for building XlaOps. The intention is that these will
+// become the public API for building XlaOps rather than calling methods on
+// XlaBuilder directly.
+
+// Enqueues a "retrieve parameter value" instruction for a parameter that was
+// passed to the computation.
+XlaOp Parameter(XlaBuilder* builder, int64 parameter_number, const Shape& shape,
+ const string& name);
+
+// Enqueues a constant with the value of the given literal onto the
+// computation.
+XlaOp ConstantLiteral(XlaBuilder* builder, const LiteralSlice& literal);
+
+// Enqueues a constant onto the computation. Methods are templated on the
+// native host type (NativeT) which corresponds to a specific XLA
+// PrimitiveType as given in the following table:
+//
+// Native Type PrimitiveType
+// -----------------------------
+// bool PRED
+// int32 S32
+// int64 S64
+// uint32 U32
+// uint64 U64
+// float F32
+// double F64
+//
+// Note: not all primitive types defined in xla_data.proto have a
+// corresponding native type yet.
+template <typename NativeT>
+XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
+template <typename NativeT>
+XlaOp ConstantR1(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<NativeT> values);
+XlaOp ConstantR1(XlaBuilder* builder, const tensorflow::core::Bitmap& values);
+template <typename NativeT>
+XlaOp ConstantR2(XlaBuilder* builder,
+ std::initializer_list<std::initializer_list<NativeT>> values);
+template <typename NativeT>
+XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
+ const Array<NativeT>& values,
+ const Layout& layout);
+template <typename NativeT>
+XlaOp ConstantFromArray(XlaBuilder* builder, const Array<NativeT>& values);
+template <typename NativeT>
+XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
+ const Array2D<NativeT>& values,
+ const Layout& layout);
+template <typename NativeT>
+XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
+ const Array2D<NativeT>& values);
+template <typename NativeT>
+XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
+ const Array3D<NativeT>& values,
+ const Layout& layout);
+template <typename NativeT>
+XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
+ const Array3D<NativeT>& values);
+template <typename NativeT>
+XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
+ const Array4D<NativeT>& values,
+ const Layout& layout);
+template <typename NativeT>
+XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
+ const Array4D<NativeT>& values);
+
+// Enqueues a rank one constant (vector) onto the computation. The vector has
+// size 'length' and every element has the value 'value'.
+template <typename NativeT>
+XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
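+//
+// Example (illustrative sketch) of the constant builders above:
+//
+//   xla::XlaBuilder b("constants");
+//   auto scalar = xla::ConstantR0<float>(&b, 42.0f);    // shape f32[]
+//   auto vec = xla::ConstantR1<int32>(&b, {1, 2, 3});   // shape s32[3]
+//   auto filled = xla::ConstantR1<float>(&b, 4, 0.5f);  // f32[4], all 0.5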
+
+// Adds dimensions to an array by duplicating the data in the array.
+//
+// The new dimensions are inserted on the left, i.e. if
+// broadcast_sizes has values {a0, ..., aN} and the operand shape
+// has dimensions {b0, ..., bM} then the shape of the output has
+// dimensions {a0, ..., aN, b0, ..., bM}.
+//
+// The new dimensions index into copies of the operand, i.e.
+//
+// output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
+XlaOp Broadcast(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
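+//
+// Example (illustrative sketch; `v` is an f32[3] XlaOp):
+//
+//   auto tiled = xla::Broadcast(v, /*broadcast_sizes=*/{2, 4});
+//   // tiled has shape f32[2, 4, 3], and tiled[i, j, k] == v[k].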
+
+// Performs in-dimension-style broadcast.
+//
+// Operand specifies the input to be broadcast. "shape" is the expected output
+// shape. "broadcast_dimensions" are the dimensions to broadcast into.
+// Dimension numbers in broadcast_dimensions map individual dimensions of the
+// operand to the dimensions of the output shape into which they should be
+// broadcast.
+// e.g.
+// Say operand = [1, 2], i.e., a 1D tensor with 2 elements,
+// and the output shape is [2,2].
+// Specifying {1} as broadcast_dimensions will generate output
+// [1 , 2]
+// [1 , 2]
+// On the other hand, specifying {0} as broadcast_dimensions
+// will generate output
+// [1 , 1]
+// [2 , 2]
+XlaOp BroadcastInDim(
+ const XlaOp& operand, const Shape& shape,
+ const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+// Enqueues a pad operation onto the computation that pads the given value on
+// the edges as well as between the elements of the input. padding_config
+// specifies the padding amount for each dimension.
+XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+ const PaddingConfig& padding_config);
+
+// Enqueues an operation onto the computation that flattens the operand based
+// on the dimension order (major/slowest-varying to minor/fastest-varying)
+// given, followed by reshaping it into the shape with the given dimension
+// sizes (also major to minor). Conceptually, this is a limited form of
+// "shape casting".
+XlaOp Reshape(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> dimensions,
+ tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+// Enqueues an operation onto the computation that collapses the operand, from
+// first to last dimension (C order), then reshapes it to the given dimension
+// sizes. Conceptually, this is a limited form of "shape casting".
+XlaOp Reshape(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> new_sizes);
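+//
+// Example (illustrative sketch; `m` is an f32[2,3] XlaOp):
+//
+//   auto r = xla::Reshape(m, /*new_sizes=*/{3, 2});
+//   // r has shape f32[3, 2]; elements are taken in row-major (C) order.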
+
+// Wrapper for Reshape.
+// Enqueues an operation to collapse the provided dimensions; e.g. an
+// operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
+// {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
+// be a consecutive, in-order subsequence of the operand dimensions.
+//
+// Note that collapsing a single dimension does nothing:
+//
+// {256} collapsing {0} => {256}
+// {1} collapsing {0} => {1}
+//
+// Collapsing multiple dimensions produces a single result dimension:
+//
+// {256, 2} collapsing {0,1} => {512}
+// {256, 2, 3} collapsing {0,1} => {512, 3}
+//
+// This could potentially cause data to be moved -- it provides a more
+// structured form of reshaping than an arbitrary Reshape operation.
+XlaOp Collapse(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> dimensions);
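+//
+// Example (illustrative sketch; `input` is an f32[256,2,2,32] XlaOp):
+//
+//   auto collapsed = xla::Collapse(input, /*dimensions=*/{0, 1, 2});
+//   // collapsed has shape f32[1024, 32].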
+
+// Enqueues a slice operation onto the computation that slices the operand
+// from the start indices to the limit indices; e.g.
+//
+// x
+// [ 0 1 2 3 ]
+// y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
+// [ 8 9 a b ]
+//
+// Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
+// range notation.
+// The strides parameter determines the stride over the slice, i.e., how far
+// to advance in each dimension between consecutive elements of the slice.
+XlaOp Slice(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> start_indices,
+ tensorflow::gtl::ArraySlice<int64> limit_indices,
+ tensorflow::gtl::ArraySlice<int64> strides);
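+//
+// Example (illustrative sketch; `x` is the f32[3,4] operand drawn above):
+//
+//   auto s = xla::Slice(x, /*start_indices=*/{1, 1},
+//                       /*limit_indices=*/{2, 3}, /*strides=*/{1, 1});
+//   // s has shape f32[1, 2] and holds [[5, 6]].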
+
+// Enqueues a slice operation in a given dimension, taking all other
+// dimensions as they are; e.g. if dimno is 1 from start_index 2 to
+// limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
+// for:
+//
+// array[:, 2:4:1, :]
+XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
+ int64 stride, int64 dimno);
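+//
+// Example (illustrative sketch; `array` has shape f32[7,8,9]):
+//
+//   auto s = xla::SliceInDim(array, /*start_index=*/2, /*limit_index=*/4,
+//                            /*stride=*/1, /*dimno=*/1);
+//   // s has shape f32[7, 2, 9].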
+
+// Enqueues a slice operation onto the computation that slices the 'operand'
+// from dynamic start indices which are passed in 'start_indices'.
+// The size of the slice in each dimension is passed in 'slice_sizes';
+// together with the start index, it defines an exclusive slice interval
+// [start, start + size) in each dimension.
+// The shape of 'start_indices' must be rank == 1, with dimension size
+// equal to the rank of the 'operand'.
+// Slice index calculations are computed modulo input dimension sizes to
+// prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+ tensorflow::gtl::ArraySlice<int64> slice_sizes);
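+//
+// Example (illustrative sketch; `data` is f32[3,3] and `starts` is a runtime
+// s32[2] index vector):
+//
+//   auto window = xla::DynamicSlice(data, starts, /*slice_sizes=*/{2, 2});
+//   // window has static shape f32[2, 2] regardless of `starts`.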
+
+// Enqueues a dynamic update slice operation onto the computation, which
+// updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
+// The shape of 'update' determines the shape of the slice of 'operand'
+// which is updated.
+// The indices specified in 'start_indices' specify the offset of the slice
+// of 'operand' which is updated.
+//
+// update = {10, 11} // calculated at runtime.
+// [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ]
+// [4 5 6] => DynamicUpdateslice(data, update, start) => [4 10 11]
+// [7 8 9] [7 8 9 ]
+//
+// The shape of 'start_indices' must be rank == 1, with dimension size
+// equal to the rank of the 'operand'.
+// Slice index calculations are computed modulo update dimension sizes to
+// prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+ const XlaOp& start_indices);
+
+// Enqueues a concatenate instruction onto the computation. 'operands' must
+// have >= 1 entry.
+XlaOp ConcatInDim(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<XlaOp> operands, int64 dimension);
+
+// Enqueues a tracing operation onto the computation; the computation will emit
+// a logging message with the operand.
+void Trace(const string& tag, const XlaOp& operand);
+
+// Enqueues a conditional-move-like select operation onto the computation;
+// predicated on pred, selects between on_true and on_false.
+XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
+
+// Enqueues a tuple-creation instruction onto the computation.
+XlaOp Tuple(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> elements);
+
+// Enqueues a tuple-element-get instruction onto the computation.
+XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
+
+// Enqueues an equal-to comparison instruction onto the computation.
+XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a not-equal comparison instruction onto the computation.
+XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a greater-or-equal comparison instruction onto the computation.
+XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a greater-than comparison instruction onto the computation.
+XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a less-than comparison instruction onto the computation.
+XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a less-or-equal comparison instruction onto the computation.
+XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a dot instruction onto the computation.
+XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+
+// Enqueues a general dot instruction onto the computation.
+XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+ const DotDimensionNumbers& dimension_numbers);
+
+// Enqueues a convolution instruction onto the computation, which uses the
+// default convolution dimension numbers.
+XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
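+//
+// Example (illustrative sketch; `input` is f32[N,C,H,W] and `kernel` is
+// f32[O,I,KH,KW], matching the default dimension numbers):
+//
+//   auto out = xla::Conv(input, kernel, /*window_strides=*/{1, 1},
+//                        xla::Padding::kSame);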
+
+// Enqueues a convolution instruction onto the computation, with the caller
+// provided padding configuration in the format returned by MakePadding().
+XlaOp ConvWithGeneralPadding(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+// Enqueues a convolution instruction onto the computation, with the caller
+// provided dimension numbers configuration.
+XlaOp ConvWithGeneralDimensions(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+ const ConvolutionDimensionNumbers& dimension_numbers);
+
+// Enqueues a convolution instruction onto the computation, with the caller
+// provided padding configuration as well as the dimension numbers.
+XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ const ConvolutionDimensionNumbers& dimension_numbers);
+
+// Enqueues a convolution instruction onto the computation, with the caller
+// provided padding configuration, dilation factors and dimension numbers.
+XlaOp ConvGeneralDilated(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+ tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+ const ConvolutionDimensionNumbers& dimension_numbers);
+
+// Enqueues an FFT instruction onto the computation, of the given type and
+// with the given FFT length.
+XlaOp Fft(const XlaOp& operand, FftType fft_type,
+ tensorflow::gtl::ArraySlice<int64> fft_length);
+
+// Enqueues an infeed instruction onto the computation, which writes data of
+// the given shape to the infeed buffer of the device.
+XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
+ const string& config = "");
+
+// Variant of Infeed which takes a token-shaped operand and produces a
+// two-element tuple containing the data value and a token-shaped value.
+// Tokens are used for ordering side-effecting operations.
+// TODO(b/110532604): Replace all uses of the non-token form with this variant.
+XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
+ const string& config = "");
+
+// Enqueues an outfeed instruction onto the computation. This instruction
+// generates outgoing data transfers for the given data.
+//
+// shape_with_layout communicates the laid out shape that we want to outfeed
+// -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
+// will occur.
+void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+ const string& outfeed_config);
+
+// Variant of Outfeed which takes a token-shaped operand and produces a
+// token-shaped value. Tokens are used for ordering side-effecting operations.
+// TODO(b/110532604): Replace all uses of the non-token form with this variant.
+XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
+ const Shape& shape_with_layout,
+ const string& outfeed_config);
+
+// Enqueues a call instruction onto the computation.
+XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<XlaOp> operands);
+
+// Enqueues a custom call instruction onto the computation.
+// During code generation, a call instruction is emitted which targets a
+// symbol with the name |call_target_name|. The |operands| are passed to the
+// call instruction. |shape| is the resultant shape.
+XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
+ tensorflow::gtl::ArraySlice<XlaOp> operands,
+ const Shape& shape);
+
+// Enqueues a pseudo-op to represent host-side computation data-dependencies.
+// During code generation, host send and receive operations will be generated
+// to transfer |operands| to the host and a single result of |shape| back to
+// the device. Host send/recv operations are emitted using |channel_name|.
+// Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
+// instruction scheduling.
+XlaOp HostCompute(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<XlaOp> operands,
+ const string& channel_name, int64 cost_estimate_ns,
+ const Shape& shape);
+
+// The following methods enqueue element-wise binary arithmetic operations
+// onto the computation. The shapes of the operands have to match unless one
+// of the operands is a scalar, or an explicit broadcast dimension is given
+// (see g3doc for more details).
+
+// Enqueues a complex compose instruction onto the computation.
+XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a complex conjugate instruction onto the computation.
+XlaOp Conj(const XlaOp& operand);
+
+// Enqueues an add instruction onto the computation.
+XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a subtract instruction onto the computation.
+XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a multiply instruction onto the computation.
+XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a divide instruction onto the computation.
+XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a remainder instruction onto the computation.
+XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a max instruction onto the computation.
+XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a min instruction onto the computation.
+XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Element-wise logical operators
+XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+XlaOp Not(const XlaOp& operand);
+
+XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+XlaOp ShiftRightArithmetic(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+XlaOp ShiftRightLogical(
+ const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Reduces an array among the provided dimensions, given "computation" as a
+// reduction operator.
+XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+
+// Convenience wrapper around the above that reduces all the dimensions in the
+// operand shape.
+XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation);
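+//
+// Example (illustrative sketch): summing over dimension 0 of a matrix with an
+// add computation built via a sub-builder:
+//
+//   xla::XlaBuilder b("reduce");
+//   xla::XlaComputation add;
+//   {
+//     auto sub = b.CreateSubBuilder("add");
+//     auto s = xla::ShapeUtil::MakeShape(xla::F32, {});
+//     xla::Add(xla::Parameter(sub.get(), 0, s, "x"),
+//              xla::Parameter(sub.get(), 1, s, "y"));
+//     add = sub->BuildAndNoteError();
+//   }
+//   auto m = xla::ConstantR2<float>(&b, {{1, 2}, {3, 4}});
+//   auto col_sums = xla::Reduce(m, xla::ConstantR0<float>(&b, 0.0f), add,
+//                               /*dimensions_to_reduce=*/{0});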
+
+// Enqueues a windowed reduce instruction onto the computation.
+XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ Padding padding);
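+//
+// Example (illustrative sketch): 2x2 max pooling with stride 2 over an
+// f32[N,C,H,W] operand `x`, given a scalar `max` XlaComputation built as in
+// the Reduce example above:
+//
+//   auto lowest = xla::ConstantR0<float>(
+//       x.builder(), -std::numeric_limits<float>::infinity());
+//   auto pooled = xla::ReduceWindow(x, lowest, max,
+//                                   /*window_dimensions=*/{1, 1, 2, 2},
+//                                   /*window_strides=*/{1, 1, 2, 2},
+//                                   xla::Padding::kValid);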
+
+// As ReduceWindow(), but the padding is given in the format
+// returned by MakePadding().
+XlaOp ReduceWindowWithGeneralPadding(
+ const XlaOp& operand, const XlaOp& init_value,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+// Returns the sum of the operand value within each subgroup of replicas. All
+// replicas supply one input to the sum and all replicas receive the resulting
+// sum for each subgroup.
+XlaOp CrossReplicaSum(
+ const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
+
+// Enqueues an operation that does an AllReduce of the operand across cores.
+// Here AllReduce means doing a reduction on the input operand across cores
+// and then broadcasting the reduction result to those cores. The reduction
+// function is defined by `computation`, which should be a commutative
+// computation on scalars, e.g., add, min, or max. The way that AllReduce is
+// applied is configured by:
+//
+// - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
+// replicas belong to one group. AllReduce will be applied within subgroups.
+// For example, with 4 replicas, replica_group_ids={0,1,0,1} means replicas 0
+// and 2 are in subgroup 0, and replicas 1 and 3 are in subgroup 1.
+//
+// - `channel_id`: for AllReduce nodes from different models, if they have the
+// same channel_id, they will be 'AllReduce'd. If empty, AllReduce will not be
+// applied across models.
+//
+// TODO(b/79737069): Rename this to AllReduce when it's ready to use.
+XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
+ const tensorflow::gtl::optional<ChannelHandle>&
+ channel_id = tensorflow::gtl::nullopt);
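+//
+// Example (illustrative sketch; `x` is a per-replica XlaOp and `add` is a
+// scalar add XlaComputation built as in the Reduce example above):
+//
+//   // With 4 replicas, {0, 2} and {1, 3} form independent reduction groups.
+//   auto summed = xla::CrossReplicaSum(x, add,
+//                                      /*replica_group_ids=*/{0, 1, 0, 1});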
+
+// Enqueues an operation that scatters the `source` array to the selected
+// indices of each window.
+XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ Padding padding, const XlaOp& source,
+ const XlaOp& init_value, const XlaComputation& scatter);
+
+// As SelectAndScatter(), but the padding is given in the format
+// returned by MakePadding().
+XlaOp SelectAndScatterWithGeneralPadding(
+ const XlaOp& operand, const XlaComputation& select,
+ tensorflow::gtl::ArraySlice<int64> window_dimensions,
+ tensorflow::gtl::ArraySlice<int64> window_strides,
+ tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+ const XlaOp& source, const XlaOp& init_value,
+ const XlaComputation& scatter);
+
+// Enqueues an abs instruction onto the computation.
+XlaOp Abs(const XlaOp& operand);
+
+// Enqueues an atan2 instruction onto the computation.
+XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues an exp instruction onto the computation.
+XlaOp Exp(const XlaOp& operand);
+
+// Enqueues an expm1 instruction onto the computation.
+XlaOp Expm1(const XlaOp& operand);
+
+// Enqueues a floor instruction onto the computation.
+XlaOp Floor(const XlaOp& operand);
+
+// Enqueues a ceil instruction onto the computation.
+XlaOp Ceil(const XlaOp& operand);
+
+// Enqueues a round instruction onto the computation, rounding to nearest,
+// with half-way cases rounding away from zero.
+XlaOp Round(const XlaOp& operand);
+
+// Enqueues a log instruction (natural logarithm) onto the computation.
+XlaOp Log(const XlaOp& operand);
+
+// Enqueues a log1p instruction (log(x+1)) onto the computation.
+XlaOp Log1p(const XlaOp& operand);
+
+// Enqueues a sign instruction onto the computation.
+XlaOp Sign(const XlaOp& operand);
+
+// Enqueues a count leading zeros instruction onto the computation.
+XlaOp Clz(const XlaOp& operand);
+
+// Enqueues a cosine instruction onto the computation.
+XlaOp Cos(const XlaOp& operand);
+
+// Enqueues a sine instruction onto the computation.
+XlaOp Sin(const XlaOp& operand);
+
+// Enqueues a tanh instruction onto the computation.
+XlaOp Tanh(const XlaOp& operand);
+
+// Enqueues a real-part instruction onto the computation.
+XlaOp Real(const XlaOp& operand);
+
+// Enqueues an imaginary-part instruction onto the computation.
+XlaOp Imag(const XlaOp& operand);
+
+// Enqueues a lhs^rhs (power) instruction onto the computation.
+XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+ tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues an operator that tests if the operand's values are finite, i.e.,
+// not Inf or NaN. Defined only for floating-point types. Returns an array of
+// booleans with the same shape where entries are true iff the corresponding
+// entry was finite.
+XlaOp IsFinite(const XlaOp& operand);
+
+// Enqueues a convert instruction onto the computation that changes the
+// element type of the operand array to new_element_type.
+XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type);
+
+// Enqueues a no-op instruction onto the computation that changes
+// the element type of the operand array to new_element_type. The
+// bit-widths of the source and destination element types must be
+// identical.
+XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type);
+
+// Enqueues a negate instruction onto the computation.
+XlaOp Neg(const XlaOp& operand);
+
+// Enqueues a transpose instruction onto the computation.
+XlaOp Transpose(const XlaOp& operand,
+ tensorflow::gtl::ArraySlice<int64> permutation);
+
+// Enqueues a reverse instruction onto the computation. The order of the
+// elements in the given dimensions is reversed (i.e., the element at index i
+// is moved to index dimension_size - 1 - i).
+XlaOp Rev(const XlaOp& operand, tensorflow::gtl::ArraySlice<int64> dimensions);
+
+// Enqueues a sort (in increasing order) instruction onto the computation.
+// If only keys are provided:
+// * If the keys are a rank-1 tensor (an array), the result is a sorted array
+// of keys, in ascending order.
+// * If the keys have higher rank, the keys are sorted along the provided
+// dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
+// value of 0 will independently sort every column, and a dimension value of 1
+// will independently sort each row. If no dimension number is provided, then
+// the last dimension is chosen by default.
+//
+// If both keys and values are provided:
+// * The keys and the values must be tensors with the same dimensions. The
+// element types of the tensors may be different.
+// * The result is a tuple that consists of a sorted tensor of keys (along the
+// provided dimension, as above) as the first element, and a tensor with their
+// corresponding values as the second element.
+XlaOp Sort(XlaOp keys,
+ tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
+ int64 dimension = -1);
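+//
+// Example (illustrative sketch; `keys` is f32[4] and `vals` is s32[4]):
+//
+//   auto sorted = xla::Sort(keys);         // sorted keys, f32[4]
+//   auto kv = xla::Sort(keys, vals);       // tuple(sorted f32[4], s32[4])
+//   auto k = xla::GetTupleElement(kv, 0);
+//   auto v = xla::GetTupleElement(kv, 1);  // values reordered with keys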
+
+// Enqueues a clamp instruction onto the computation.
+XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+
+// Enqueues a map instruction onto the computation.
+XlaOp Map(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> operands,
+ const XlaComputation& computation,
+ tensorflow::gtl::ArraySlice<int64> dimensions,
+ tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
+
+// Enqueues an N(mu, sigma) random number generation instruction onto the
+// computation.
+XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
+
+// Enqueues a U(a, b) random number generation instruction onto the
+// computation. Returns values in the semi-open interval [a, b).
+XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+
+// Enqueues a while node onto the computation.
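+//
+// Example (illustrative sketch): counting from 0 to 10, where `cond` and
+// `body` are XlaComputations built with sub-builders as in the Reduce
+// example above:
+//
+//   // cond: (s32[]) -> pred[], computes x < 10.
+//   // body: (s32[]) -> s32[],  computes x + 1.
+//   xla::XlaBuilder b("while");
+//   auto result = xla::While(cond, body, xla::ConstantR0<int32>(&b, 0));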
+XlaOp While(const XlaComputation& condition, const XlaComputation& body,
+ const XlaOp& init);
+
+// Enqueues a conditional node onto the computation.
+XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+ const XlaComputation& true_computation,
+ const XlaOp& false_operand,
+ const XlaComputation& false_computation);
+
+// Enqueues a ReducePrecision node onto the computation.
+XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+ const int mantissa_bits);
+
+// Enqueues a Gather node onto the computation.
+XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+ const GatherDimensionNumbers& dimension_numbers,
+ tensorflow::gtl::ArraySlice<int64> window_bounds);
+
+// Enqueues a Send node onto the computation for device-to-device
+// communication. This operation sends the given operand to
+// a Recv instruction in a different computation that shares the same channel
+// handle.
+void Send(const XlaOp& operand, const ChannelHandle& handle);
+
+// Variant of Send which takes a token-shaped operand and produces a
+// token-shaped value. Tokens are used for ordering side-effecting operations.
+// TODO(b/110532604): Replace all uses of the non-token form with this variant.
+XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
+ const ChannelHandle& handle);
+
+// Enqueues a Recv node onto the computation for device-to-device
+// communication. The data comes from a Send instruction in a different
+// computation that shares the same channel handle and its shape must be the
+// same as the given shape.
+XlaOp Recv(XlaBuilder* builder, const Shape& shape,
+ const ChannelHandle& handle);
+
+// Variant of Recv which takes a token-shaped operand and produces a two-element
+// tuple containing the data value and a token-shaped value. Tokens are used
+// for ordering side-effecting operations.
+// TODO(b/110532604): Replace all uses of the non-token form with this variant.
+XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
+ const ChannelHandle& handle);
+
+// Enqueues a Send node which transfers data from the device to the host. The
+// 'shape_with_layout' argument defines the layout of the data transferred; its
+// shape must be compatible with the shape of the operand. The operand must be
+// array-shaped.
+// TODO(b/111544877): Support tuple shapes.
+XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
+ const Shape& shape_with_layout, const ChannelHandle& handle);
+
+// Enqueues a Recv node which transfers data from the host to the device. The
+// given shape must contain a layout and must be an array.
+// TODO(b/111544877): Support tuple shapes.
+XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
+ const ChannelHandle& handle);
+
+// Enqueues an operation (AfterAll) with no operands that produces a
+// token-shaped value. Tokens are used for ordering side-effecting operations.
+// This is a separate method from AfterAll to facilitate the removal of
+// operand-less AfterAll instructions.
+// TODO(b/110532604): Remove this function when all tokens are derived from a
+// single token generated or passed into the entry computation.
+XlaOp CreateToken(XlaBuilder* builder);
+
+// Enqueues an AfterAll instruction which produces a token-shaped value and
+// takes a variadic number of token-shaped operands. The number of operands must
+// be greater than zero. Used for joining tokens.
+XlaOp AfterAll(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> tokens);
+
+// Normalizes operand across spatial and batch dimensions for each feature.
+//
+// Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
+// is the normalized result and batch_mean and batch_var are the mean and
+// variance, respectively, across batch for the operand.
+XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+ const XlaOp& offset, float epsilon,
+ int64 feature_index);
+
+// Normalizes operand across spatial and batch dimensions for each feature.
+//
+// `BatchNormInference` is equivalent to calling `BatchNormTraining` without
+// computing `mean` and `variance` for each batch inside the operation. It
+// uses the input `mean` and `variance` instead as estimated values. The
+// purpose of this op is to reduce latency in inference, hence the name
+// `BatchNormInference`.
+//
+// The output has the same shape as `operand`, and contains the normalized
+// values for each batch.
+XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+ const XlaOp& offset, const XlaOp& mean,
+ const XlaOp& variance, float epsilon,
+ int64 feature_index);
+
+// Calculates the gradients of a batch norm op.
+//
+// The inputs `batch_mean` and `batch_var` represent the mean and variance
+// across the batch.
+//
+// Returns a tuple of three elements:
+// - grad_operand: Gradient with respect to input `operand`
+// - grad_offset: Gradient with respect to input `offset`
+// - grad_scale: Gradient with respect to input `scale`
+XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+ const XlaOp& batch_mean, const XlaOp& batch_var,
+ const XlaOp& grad_output, float epsilon,
+ int64 feature_index);
+
+// Implementation details below this point.
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR0(NativeT value) {
+ return ConstantLiteral(*LiteralUtil::CreateR0<NativeT>(value));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values) {
+ return ConstantLiteral(*LiteralUtil::CreateR1<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) {
+ Literal literal(ShapeUtil::MakeShape(
+ primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
+ literal.PopulateWithValue(value);
+ return ConstantLiteral(literal);
+}
+
+inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) {
+ return ConstantLiteral(*LiteralUtil::CreateR1(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2(
+ std::initializer_list<std::initializer_list<NativeT>> values) {
+ return ConstantLiteral(*LiteralUtil::CreateR2<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array<NativeT>& values,
+ const Layout& layout) {
+ return ConstantLiteral(
+ *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantFromArray(const Array<NativeT>& values) {
+ return ConstantLiteral(*LiteralUtil::CreateFromArray<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout(
+ const Array2D<NativeT>& values, const Layout& layout) {
+ return ConstantLiteral(
+ *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D<NativeT>& values) {
+ return ConstantLiteral(*LiteralUtil::CreateR2FromArray2D<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout(
+ const Array3D<NativeT>& values, const Layout& layout) {
+ return ConstantLiteral(
+ *LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D<NativeT>& values) {
+ return ConstantFromArray(values);
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout(
+ const Array4D<NativeT>& values, const Layout& layout) {
+ return ConstantFromArrayWithLayout(values, layout);
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
+ return ConstantFromArray(values);
+}
+
+// Free function template implementations.
+
+template <typename NativeT>
+XlaOp ConstantR0(XlaBuilder* builder, NativeT value) {
+ return ConstantLiteral(builder, *LiteralUtil::CreateR0<NativeT>(value));
+}
+
+template <typename NativeT>
+XlaOp ConstantR1(XlaBuilder* builder,
+ tensorflow::gtl::ArraySlice<NativeT> values) {
+ return ConstantLiteral(builder, *LiteralUtil::CreateR1<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value) {
+ Literal literal(ShapeUtil::MakeShape(
+ primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
+ literal.PopulateWithValue(value);
+ return ConstantLiteral(builder, literal);
+}
+
+inline XlaOp ConstantR1(XlaBuilder* builder,
+ const tensorflow::core::Bitmap& values) {
+ return ConstantLiteral(builder, *LiteralUtil::CreateR1(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantR2(XlaBuilder* builder,
+ std::initializer_list<std::initializer_list<NativeT>> values) {
+ return ConstantLiteral(builder, *LiteralUtil::CreateR2<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
+ const Array<NativeT>& values,
+ const Layout& layout) {
+ return ConstantLiteral(
+ builder,
+ *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp ConstantFromArray(XlaBuilder* builder, const Array<NativeT>& values) {
+ return ConstantLiteral(builder,
+ *LiteralUtil::CreateFromArray<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
+ const Array2D<NativeT>& values,
+ const Layout& layout) {
+ return ConstantLiteral(
+ builder,
+ *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
+ const Array2D<NativeT>& values) {
+ return ConstantLiteral(builder,
+ *LiteralUtil::CreateR2FromArray2D<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
+ const Array3D<NativeT>& values,
+ const Layout& layout) {
+ return ConstantLiteral(
+ builder,
+ *LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
+ const Array3D<NativeT>& values) {
+ return ConstantFromArray(builder, values);
+}
+
+template <typename NativeT>
+XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
+ const Array4D<NativeT>& values,
+ const Layout& layout) {
+ return ConstantFromArrayWithLayout(builder, values, layout);
+}
+
+template <typename NativeT>
+XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
+ const Array4D<NativeT>& values) {
+ return ConstantFromArray(builder, values);
+}
+
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index 3b8beb2c78..28a207b137 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include <string>
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/hlo_matchers.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
index 763653c685..2e131dbad2 100644
--- a/tensorflow/compiler/xla/client/xla_client/BUILD
+++ b/tensorflow/compiler/xla/client/xla_client/BUILD
@@ -24,56 +24,10 @@ filegroup(
load("//tensorflow:tensorflow.bzl", "tf_cc_test")
cc_library(
- name = "xla_computation",
- srcs = ["xla_computation.cc"],
- hdrs = ["xla_computation.h"],
- deps = [
- "//tensorflow/compiler/xla:status_macros",
- "//tensorflow/compiler/xla:util",
- "//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/service:hlo_proto",
- ],
-)
-
-cc_library(
name = "xla_builder",
- srcs = ["xla_builder.cc"],
hdrs = ["xla_builder.h"],
visibility = ["//visibility:public"],
deps = [
- ":xla_computation",
- "//tensorflow/compiler/xla:execution_options_util",
- "//tensorflow/compiler/xla:literal",
- "//tensorflow/compiler/xla:literal_util",
- "//tensorflow/compiler/xla:shape_util",
- "//tensorflow/compiler/xla:status_macros",
- "//tensorflow/compiler/xla:statusor",
- "//tensorflow/compiler/xla:types",
- "//tensorflow/compiler/xla:util",
- "//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client:padding",
- "//tensorflow/compiler/xla/client:sharding_builder",
- "//tensorflow/compiler/xla/service:hlo",
- "//tensorflow/compiler/xla/service:hlo_proto",
- "//tensorflow/compiler/xla/service:shape_inference",
- "//tensorflow/core:lib",
- ],
-)
-
-tf_cc_test(
- name = "xla_builder_test",
- srcs = ["xla_builder_test.cc"],
- deps = [
- ":xla_builder",
- "//tensorflow/compiler/xla:literal",
- "//tensorflow/compiler/xla:shape_util",
- "//tensorflow/compiler/xla:status_macros",
- "//tensorflow/compiler/xla:test",
- "//tensorflow/compiler/xla:test_helpers",
- "//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
- "//tensorflow/compiler/xla/service:hlo",
- "//tensorflow/compiler/xla/service:hlo_matchers",
- "//tensorflow/core:test",
+ "//tensorflow/compiler/xla/client:xla_builder",
],
)
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 445c1e0d77..ce2a8afd4c 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -16,2214 +16,6 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
-#include <map>
-#include <string>
-#include <type_traits>
-#include <utility>
-
-#include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/hlo.pb.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/stacktrace.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-
-class XlaBuilder;
-
-// This represents an instruction that has been enqueued using the XlaBuilder.
-// This is used to pass to subsequent computations that depends upon the
-// instruction as an operand.
-class XlaOp {
- public:
- XlaOp() : handle_(-1), builder_(nullptr) {
- static_assert(std::is_trivially_destructible<XlaOp>::value,
- "XlaOp should be trivially destructible");
- }
- ~XlaOp() = default;
-
- XlaBuilder* builder() const { return builder_; }
-
- // Returns true if the XlaOp represents valid, non-erroneous value.
- bool valid() const { return handle_ >= 0; }
-
- // Returns true if the XlaOp was created by the XlaOp() constructor and
- // not returned by a builder.
- bool IsUninitialized() const { return builder_ == nullptr; }
-
- bool IsIdenticalTo(const XlaOp& rhs) const {
- return handle_ == rhs.handle_ && builder_ == rhs.builder_;
- }
-
- friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) {
- out << op.handle();
- return out;
- }
-
- private:
- explicit XlaOp(XlaBuilder* builder) : handle_(-1), builder_(builder) {}
- XlaOp(int64 handle, XlaBuilder* builder)
- : handle_(handle), builder_(builder) {}
-
- int64 handle() const { return handle_; }
-
- friend class XlaBuilder;
-
- // < 0 means "invalid handle".
- int64 handle_;
-
- // Not owned. Non-null for any handle returned by XlaBuilder, even if the
- // handle is invalid.
- XlaBuilder* builder_;
-};
-
-// Arithmetic operator overloads for the XlaOp type.
-XlaOp operator-(const XlaOp& x);
-XlaOp operator+(const XlaOp& x, const XlaOp& y);
-XlaOp operator-(const XlaOp& x, const XlaOp& y);
-XlaOp operator*(const XlaOp& x, const XlaOp& y);
-XlaOp operator/(const XlaOp& x, const XlaOp& y);
-XlaOp operator%(const XlaOp& x, const XlaOp& y);
-
-// Bitwise operator overloads for the XlaOp type.
-XlaOp operator~(const XlaOp& x);
-XlaOp operator&(const XlaOp& x, const XlaOp& y);
-XlaOp operator|(const XlaOp& x, const XlaOp& y);
-XlaOp operator^(const XlaOp& x, const XlaOp& y);
-XlaOp operator<<(const XlaOp& x, const XlaOp& y);
-// Performs a right arithmetic shift if 'x' is a signed type, otherwise performs
-// a right logical shift.
-XlaOp operator>>(const XlaOp& x, const XlaOp& y);
-
-// We don't overload the relational operators (==, !=, <, <=, >, >=) because the
-// semantics might be surprising since their result types are usually 'bool'.
-// Further programmers may expect == to be a structural equality.
-// We also choose not to overload any of the mutating operators (e.g., +=, -=)
-// because the semantics might be misleading — XLA computations are immutable.
-
-// A convenient interface for building up computations.
-//
-// Thread-compatible.
-class XlaBuilder {
- public:
- // computation_name: name to use for the built computation.
- XlaBuilder(const string& computation_name);
-
- XlaBuilder(const XlaBuilder&) = delete;
- XlaBuilder& operator=(const XlaBuilder&) = delete;
-
- ~XlaBuilder();
-
- // Returns the computation name.
- const string& name() const { return name_; }
-
- // Sets OpMetadata that will be added to all instructions until cleared.
- //
- // OpMetadata is often applied to a series of XLA HLO instructions. As a
- // result, OpMetadata is set on the Computation Builder. All subsequent
- // instructions generated via this Computation Builder will have the same
- // OpMetadata attached until a call to ClearOpMetadata.
- void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
-
- // Clears the HloMetadata state.
- void ClearOpMetadata() { metadata_.Clear(); }
-
- // Sets an OpSharding that will be attached to all instructions until cleared.
- void SetSharding(const OpSharding& sharding) { sharding_ = sharding; }
-
- // Clears the sharding. Ops will be sharded according to the default placement
- // policy.
- void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; }
-
- // Returns the OpSharding that will be attached to all instructions.
- const tensorflow::gtl::optional<OpSharding>& sharding() const {
- return sharding_;
- }
-
- // Sets the builder to a mode where it will die immediately when an error is
- // encountered, rather than producing it in a deferred fashion when Build() is
- // called (which is the default).
- void set_die_immediately_on_error(bool enabled) {
- die_immediately_on_error_ = enabled;
- }
-
- // Default dimension numbers used for a 2D convolution.
- static constexpr int64 kConvBatchDimension = 0;
- static constexpr int64 kConvFeatureDimension = 1;
- static constexpr int64 kConvFirstSpatialDimension = 2;
- static constexpr int64 kConvSecondSpatialDimension = 3;
- static constexpr int64 kConvKernelOutputDimension = 0;
- static constexpr int64 kConvKernelInputDimension = 1;
- static constexpr int64 kConvKernelFirstSpatialDimension = 2;
- static constexpr int64 kConvKernelSecondSpatialDimension = 3;
-
- // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for
- // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for
- // the kernel operand
- // {output_feature, input_feature, height, width} = {0, 1, 2, 3}.
- static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers(
- int num_spatial_dims = 2);
-
- // Returns an error if the convolution dimension numbers have conflicts.
- static Status Validate(const ConvolutionDimensionNumbers& dnum);
-
- // Returns a new XlaBuilder whose resultant Computation is used only by this
- // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error
- // behavior as the parent.
- std::unique_ptr<XlaBuilder> CreateSubBuilder(const string& computation_name);
-
- // Builds the computation with the requested operations, or returns a non-ok
- // status. Note that all ops that have been enqueued will be moved to the
- // computation being returned.
- StatusOr<XlaComputation> Build();
-
- // Builds the computation with the requested operations, or notes an error in
- // the parent XlaBuilder and returns an empty computation if building failed.
- // This function is intended to be used where the returned XlaComputation is
- // only used by the parent XlaBuilder and hence further operation on the
- // returned XlaComputation will simply be error'ed out if an error occurred
- // while building this computation. If the built computation is to be used by
- // a XlaBuilder other than the parent XlaBuilder then Build() should be used
- // instead.
- XlaComputation BuildAndNoteError();
-
- // Returns a subgraph that roots on the given root. If the root is not a
- // compile-time constant (see `IsConstant`), returns an error.
- //
- // This will copy the needed ops/computations to the subgraph.
- StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op) const;
-
- // Returns the first error that was encountered while building the
- // computation. When an error is encountered, by default we return a vacuous
- // XlaOp and inform the user of the error that occurred while
- // building the computation when they make a final call to Build().
- //
- // See also set_die_immediately_on_error().
- Status first_error() const { return first_error_; }
-
- // Returns the shape of the given op.
- StatusOr<Shape> GetShape(const XlaOp& op) const;
-
- // Returns the (inferred) result for the current computation's shape.
- StatusOr<ProgramShape> GetProgramShape() const;
-
- // Reports an error to the builder, by
- // * storing it internally and capturing a backtrace if it's the first error
- // (this deferred value will be produced on the call to
- // Build()/GetShape()/...)
- // * dying if die_immediately_on_error_ is true.
- // Returns an XlaOp with an invalid handle but a valid builder. This value can
- // be returned in place of a value in APIs that return an XlaOp.
- XlaOp ReportError(const Status& error);
-
- // A helper function that converts a StatusOr<XlaOp> into an XlaOp.
- // If the Status was an error, reports the error to builder and returns an
- // invalid XlaOp handle.
- XlaOp ReportErrorOrReturn(const StatusOr<XlaOp>& op);
-
- // A helper function that runs a function that returns a StatusOr<XlaOp> and
- // returns an XlaOp.
- XlaOp ReportErrorOrReturn(const std::function<StatusOr<XlaOp>()>& op_creator);
-
- // Returns true if 'operand' is a compile-time constant. A compile-time
- // constant does not depend on any parameters, or on stateful operators such
- // as `RngNormal` or `Infeed`.
- //
- // This tests whether a computation is a compile-time constant without
- // evaluating the computation.
- StatusOr<bool> IsConstant(const XlaOp& operand) const;
-
- private:
- // Enqueues a "retrieve parameter value" instruction for a parameter that was
- // passed to the computation.
- XlaOp Parameter(int64 parameter_number, const Shape& shape,
- const string& name);
-
- // Enqueues a constant with the value of the given literal onto the
- // computation.
- XlaOp ConstantLiteral(const LiteralSlice& literal);
-
- // Enqueues a constant onto the computation. Methods are templated on the
- // native host type (NativeT) which corresponds to a specific XLA
- // PrimitiveType as given in the following table:
- //
- // Native Type PrimitiveType
- // -----------------------------
- // bool PRED
- // int32 S32
- // int64 S64
- // uint32 U32
- // uint64 U64
- // float F32
- // double F64
- //
- // Note: not all primitive types defined in xla_data.proto have a
- // corresponding native type yet.
- template <typename NativeT>
- XlaOp ConstantR0(NativeT value);
- template <typename NativeT>
- XlaOp ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values);
- XlaOp ConstantR1(const tensorflow::core::Bitmap& values);
- template <typename NativeT>
- XlaOp ConstantR2(
- std::initializer_list<std::initializer_list<NativeT>> values);
- template <typename NativeT>
- XlaOp ConstantFromArrayWithLayout(const Array<NativeT>& values,
- const Layout& layout);
- template <typename NativeT>
- XlaOp ConstantFromArray(const Array<NativeT>& values);
- template <typename NativeT>
- XlaOp ConstantR2FromArray2DWithLayout(const Array2D<NativeT>& values,
- const Layout& layout);
- template <typename NativeT>
- XlaOp ConstantR2FromArray2D(const Array2D<NativeT>& values);
- template <typename NativeT>
- XlaOp ConstantR3FromArray3DWithLayout(const Array3D<NativeT>& values,
- const Layout& layout);
- template <typename NativeT>
- XlaOp ConstantR3FromArray3D(const Array3D<NativeT>& values);
- template <typename NativeT>
- XlaOp ConstantR4FromArray4DWithLayout(const Array4D<NativeT>& values,
- const Layout& layout);
- template <typename NativeT>
- XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
-
- // Enqueues a rank one constant (vector) onto the computation. The vector has
- // size 'length' and every element has the value 'value'.
- template <typename NativeT>
- XlaOp ConstantR1(int64 length, NativeT value);
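A minimal sketch of the constant builders above, using the free-function forms declared later in this header (the builder name and values are illustrative only):

  XlaBuilder b("constants");
  auto pred = ConstantR0<bool>(&b, true);                 // PRED scalar
  auto ints = ConstantR1<int32>(&b, {1, 2, 3});           // S32 vector
  auto fill = ConstantR1<float>(&b, /*length=*/4, 0.0f);  // F32 {0, 0, 0, 0}
  auto mat  = ConstantR2<float>(&b, {{1.0f, 2.0f},        // F32 2x2 matrix
                                     {3.0f, 4.0f}});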
-
- // Adds dimensions to an array by duplicating the data in the array.
- //
- // The new dimensions are inserted on the left, i.e. if
- // broadcast_sizes has values {a0, ..., aN} and the operand shape
- // has dimensions {b0, ..., bM} then the shape of the output has
- // dimensions {a0, ..., aN, b0, ..., bM}.
- //
- // The new dimensions index into copies of the operand, i.e.
- //
- // output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
- XlaOp Broadcast(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
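As a sketch of the left-prepended broadcast described above (the shape noted in the comment is the inferred result):

  XlaBuilder b("broadcast");
  // v has shape f32[3]; broadcast_sizes {2} prepends a dimension of size 2,
  // yielding shape f32[2,3] with output[i, j] = v[j].
  auto v = ConstantR1<float>(&b, {1.0f, 2.0f, 3.0f});
  auto rows = Broadcast(v, /*broadcast_sizes=*/{2});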
-
- // Performs in-dimension-style broadcast.
- //
- // Operand specifies the input to be broadcast. "shape" is the expected
- // output shape. "broadcast_dimensions" are the dimensions to broadcast
- // into. Dimension numbers in broadcast_dimensions map to individual
- // dimensions of the operand, and specify which dimension of the output
- // shape they are broadcast into.
- // e.g.
- // Say operand = [1, 2], i.e., a 1D tensor with 2 elements,
- // and the shape's dimensions are [2,2].
- // Specifying {1} as broadcast_dimensions will generate output
- // [1 , 2]
- // [1 , 2]
- // On the other hand, specifying {0} as broadcast_dimensions
- // will generate output
- // [1 , 1]
- // [2 , 2]
- XlaOp BroadcastInDim(
- const XlaOp& operand, const Shape& shape,
- const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
- // Enqueues a pad operation onto the computation that pads the given value on
- // the edges as well as between the elements of the input. padding_config
- // specifies the padding amount for each dimension.
- XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
- const PaddingConfig& padding_config);
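A minimal sketch of building a PaddingConfig by hand (the field names follow the PaddingConfig proto in xla_data.proto; the padding amounts are illustrative):

  XlaBuilder b("pad");
  auto v = ConstantR1<float>(&b, {1.0f, 2.0f, 3.0f});
  auto zero = ConstantR0<float>(&b, 0.0f);
  // One element of low edge padding, two of high edge padding, and one
  // element between each pair of inputs: {0,1,0,2,0,3,0,0}, shape f32[8].
  PaddingConfig config;
  auto* dim = config.add_dimensions();
  dim->set_edge_padding_low(1);
  dim->set_edge_padding_high(2);
  dim->set_interior_padding(1);
  auto padded = Pad(v, zero, config);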
-
- // Enqueues an operation onto the computation that flattens the operand based
- // on the dimension order (major/slowest-varying to minor/fastest-varying)
- // given, followed by reshaping it into the shape with the given dimension
- // sizes (also major to minor). Conceptually, this is a limited form of
- // "shape casting".
- XlaOp Reshape(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> dimensions,
- tensorflow::gtl::ArraySlice<int64> new_sizes);
-
- // Enqueues an operation onto the computation that collapses the operand from
- // first to last dimension (C order), then reshapes it to the given dimension
- // sizes. Conceptually, this is a limited form of "shape casting".
- XlaOp Reshape(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> new_sizes);
-
- // Wrapper for Reshape.
- // Enqueues an operation to collapse the provided dimensions; e.g. an
- // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
- // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
- // be a consecutive, in-order subsequence of the operand dimensions.
- //
- // Note that collapsing a single dimension does nothing:
- //
- // {256} collapsing {0} => {256}
- // {1} collapsing {0} => {1}
- //
- // Collapsing multiple dimensions produces a single result dimension:
- //
- // {256, 2} collapsing {0,1} => {512}
- // {256, 2, 3} collapsing {0,1} => {512, 3}
- //
- // This could potentially cause data to be moved -- it provides a more
- // structured form of reshaping than an arbitrary Reshape operation.
- XlaOp Collapse(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> dimensions);
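As a sketch, collapsing the two leading dimensions of a rank-3 parameter (ShapeUtil::MakeShape is assumed from XLA's shape utilities):

  XlaBuilder b("collapse");
  // arr has shape f32[2,3,4]; collapsing dims {0,1} yields shape f32[6,4].
  auto arr = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3, 4}), "arr");
  auto collapsed = Collapse(arr, /*dimensions=*/{0, 1});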
-
- // Enqueues a slice operation onto the computation that slices the operand
- // from the start indices to the limit indices; e.g.
- //
- // x
- // [ 0 1 2 3 ]
- // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
- // [ 8 9 a b ]
- //
- // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
- // range notation.
- // The strides parameter determines the stride over the slice.
- XlaOp Slice(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> start_indices,
- tensorflow::gtl::ArraySlice<int64> limit_indices,
- tensorflow::gtl::ArraySlice<int64> strides);
-
- // Enqueues a slice operation in a given dimension, taking all other
- // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
- // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
- // for:
- //
- // array[:, 2:4:1, :]
- XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
- int64 stride, int64 dimno);
-
- // Enqueues a slice operation onto the computation that slices the 'operand'
- // from dynamic start indices which are passed in 'start_indices'.
- // The size of the slice in each dimension is passed in 'slice_sizes',
- // which specifies the exclusive end point of the slice interval in each
- // dimension: [start, start + size).
- // The shape of 'start_indices' must be rank == 1, with dimension size
- // equal to the rank of the 'operand'.
- // Slice index calculations are computed modulo input dimension sizes to
- // prevent dynamic start indices from generating out-of-bound array accesses.
- XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
- tensorflow::gtl::ArraySlice<int64> slice_sizes);
-
- // Enqueues a dynamic update slice operation onto the computation, which
- // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
- // The shape of 'update' determines the shape of the slice of 'operand'
- // which is updated.
- // The indices specified in 'start_indices' specify the offset of the slice
- // of 'operand' which is updated.
- //
- // update = {10, 11} // calculated at runtime.
- // [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ]
- // [4 5 6] => DynamicUpdateSlice(data, update, start) => [4 10 11]
- // [7 8 9] [7 8 9 ]
- //
- // The shape of 'start_indices' must be rank == 1, with dimension size
- // equal to the rank of the 'operand'.
- // Slice index calculations are computed modulo update dimension sizes to
- // prevent dynamic start indices from generating out-of-bound array accesses.
- XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
- const XlaOp& start_indices);
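A sketch reproducing the worked example above with constant inputs (in real graphs 'update' and 'start' are typically computed at runtime):

  XlaBuilder b("dus");
  auto data = ConstantR2<float>(&b, {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
  auto update = ConstantR2<float>(&b, {{10, 11}});  // shape f32[1,2]
  // Rank-1 start indices, one entry per dimension of 'data'.
  auto start = ConstantR1<int32>(&b, {1, 1});
  auto result = DynamicUpdateSlice(data, update, start);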
-
- // Enqueues a concatenate instruction onto the computation. 'operands' must
- // have >= 1 entry.
- XlaOp ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
- int64 dimension);
-
- // Enqueues a tracing operation onto the computation; the computation will emit
- // a logging message with the operand.
- void Trace(const string& tag, const XlaOp& operand);
-
- // Enqueues a conditional-move-like select operation onto the computation;
- // predicated on pred, selects between on_true and on_false.
- XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
-
- // Enqueues a tuple-creation instruction onto the computation.
- XlaOp Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements);
-
- // Enqueues a tuple-element-get instruction onto the computation.
- XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
-
- // Enqueues an equal-to comparison instruction onto the computation.
- XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a not-equal comparison instruction onto the computation.
- XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a greater-or-equal comparison instruction onto the computation.
- XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a greater-than comparison instruction onto the computation.
- XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a less-than comparison instruction onto the computation.
- XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a less-or-equal comparison instruction onto the computation.
- XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a dot instruction onto the computation.
- XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
-
- // Enqueues a general dot instruction onto the computation.
- XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
- const DotDimensionNumbers& dimension_numbers);
-
- // Enqueues a convolution instruction onto the computation, which uses the
- // default convolution dimension numbers.
- XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- Padding padding);
-
- // Enqueues a convolution instruction onto the computation, with the caller
- // provided padding configuration in the format returned by MakePadding().
- XlaOp ConvWithGeneralPadding(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
- // Enqueues a convolution instruction onto the computation, with the caller
- // provided dimension numbers configuration.
- XlaOp ConvWithGeneralDimensions(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
- const ConvolutionDimensionNumbers& dimension_numbers);
-
- // Enqueues a convolution instruction onto the computation, with the caller
- // provided padding configuration as well as the dimension numbers.
- XlaOp ConvGeneral(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- const ConvolutionDimensionNumbers& dimension_numbers);
-
- // Enqueues a convolution instruction onto the computation, with the caller
- // provided padding configuration, dilation factors and dimension numbers.
- XlaOp ConvGeneralDilated(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- tensorflow::gtl::ArraySlice<int64> lhs_dilation,
- tensorflow::gtl::ArraySlice<int64> rhs_dilation,
- const ConvolutionDimensionNumbers& dimension_numbers);
-
- // Enqueues an FFT instruction onto the computation, of the given type and
- // with the given FFT length.
- XlaOp Fft(const XlaOp& operand, FftType fft_type,
- tensorflow::gtl::ArraySlice<int64> fft_length);
-
- // Enqueues an infeed instruction onto the computation, which writes data of
- // the given shape to the infeed buffer of the device.
- XlaOp Infeed(const Shape& shape, const string& config = "");
- XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
- const string& config = "");
-
- // Enqueues an outfeed instruction onto the computation. This instruction
- // generates outgoing data transfers for the given data.
- //
- // shape_with_layout communicates the laid out shape that we want to outfeed
- // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
- // will occur.
- void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
- const string& outfeed_config);
- XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
- const Shape& shape_with_layout,
- const string& outfeed_config);
-
- // Enqueues a call instruction onto the computation.
- XlaOp Call(const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<XlaOp> operands);
-
- // Enqueues a custom call instruction onto the computation.
- // During code generation, a call instruction is emitted which targets a
- // symbol with the name |call_target_name|. The |operands| are passed to the
- // call instruction. |shape| is the resultant shape.
- XlaOp CustomCall(const string& call_target_name,
- tensorflow::gtl::ArraySlice<XlaOp> operands,
- const Shape& shape);
-
- // Enqueues a pseudo-op to represent host-side computation data-dependencies.
- // During code generation, host send and receive operations will be generated
- // to transfer |operands| to the host and a single result of |shape| back to
- // the device. Host send/recv operations are emitted using |channel_name|.
- // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
- // instruction scheduling.
- XlaOp HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
- const string& channel_name, int64 cost_estimate_ns,
- const Shape& shape);
-
- // The following methods enqueue element-wise binary arithmetic operations
- // onto the computation. The shapes of the operands have to match unless one
- // of the operands is a scalar, or an explicit broadcast dimension is given
- // (see g3doc for more details).
-
- // Enqueues a complex compose instruction onto the computation.
- XlaOp Complex(const XlaOp& real, const XlaOp& imag,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a complex conjugate instruction onto the computation.
- XlaOp Conj(const XlaOp& operand);
-
- // Enqueues an add instruction onto the computation.
- XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
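A hedged sketch of the explicit broadcast_dimensions mechanism on Add (the same parameter appears on the other binary ops above and below):

  XlaBuilder b("add_bcast");
  auto m = ConstantR2<float>(&b, {{1, 2, 3}, {4, 5, 6}});
  auto v = ConstantR1<float>(&b, {10, 20, 30});
  // Maps v's single dimension onto dimension 1 of m, adding v to every row:
  // {{11, 22, 33}, {14, 25, 36}}.
  auto sum = Add(m, v, /*broadcast_dimensions=*/{1});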
-
- // Enqueues a subtract instruction onto the computation.
- XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a multiply instruction onto the computation.
- XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a divide instruction onto the computation.
- XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a remainder instruction onto the computation.
- XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a max instruction onto the computation.
- XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues a min instruction onto the computation.
- XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Element-wise logical operators
- XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- XlaOp Not(const XlaOp& operand);
-
- XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
- XlaOp ShiftRightArithmetic(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
- XlaOp ShiftRightLogical(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Reduces an array among the provided dimensions, given "computation" as a
- // reduction operator.
- XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
-
- // Convenience wrapper around the above that reduces all the dimensions in the
- // operand shape.
- XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation);
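A minimal sketch of a row-sum reduction; the scalar add computation is built with a separate builder, and Build()/ValueOrDie() are assumed to follow the usual StatusOr conventions:

  // Scalar add computation to serve as the reduction operator.
  XlaBuilder add_b("add");
  Add(Parameter(&add_b, 0, ShapeUtil::MakeShape(F32, {}), "x"),
      Parameter(&add_b, 1, ShapeUtil::MakeShape(F32, {}), "y"));
  XlaComputation add = add_b.Build().ValueOrDie();

  XlaBuilder b("reduce");
  auto input = ConstantR2<float>(&b, {{1, 2, 3}, {4, 5, 6}});
  auto zero = ConstantR0<float>(&b, 0.0f);
  // Reducing over dimension 1 sums each row: f32[2] result {6, 15}.
  auto row_sums = Reduce(input, zero, add, /*dimensions_to_reduce=*/{1});
  // ReduceAll(input, zero, add) would instead produce the scalar 21.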
-
- // Enqueues a windowed reduce instruction onto the computation.
- XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- Padding padding);
-
- // As ReduceWindow(), but the padding is given in the format
- // returned by MakePadding().
- XlaOp ReduceWindowWithGeneralPadding(
- const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
- // Returns the sum of the operand value within each subgroup of replicas. All
- // replicas supply one input to the sum and all replicas receive the resulting
- // sum for each subgroup.
- XlaOp CrossReplicaSum(
- const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
-
- // Enqueues an operation that does an AllReduce of the operand across cores.
- // Here AllReduce means reducing the input operand across cores and then
- // broadcasting the reduction result to those cores. The reduction function is
- // defined by `computation`, which should be a commutative computation on
- // scalars, e.g., add, min, or max. The way that AllReduce is applied is
- // configured by:
- //
- // - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
- // replicas belong to one group. AllReduce will be applied within subgroups.
- // For example, with 4 replicas, replica_group_ids={0,1,0,1} means that
- // replicas 0 and 2 are in subgroup 0, and replicas 1 and 3 are in subgroup 1.
- //
- // - `channel_id`: for AllReduce nodes from different models, if they have the
- // same channel_id, they will be 'AllReduce'd. If empty, AllReduce will not be
- // applied across models.
- //
- // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
- XlaOp CrossReplicaSum(
- const XlaOp& operand, const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
- const tensorflow::gtl::optional<ChannelHandle>& channel_id =
- tensorflow::gtl::nullopt);
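For instance, a hedged sketch of subgrouped summation ('x' stands for some previously built operand, and 'add' for a scalar add computation as sketched earlier):

  // With 4 replicas, replicas {0, 2} and {1, 3} each receive the sum taken
  // over their own subgroup only.
  auto subgroup_sum =
      CrossReplicaSum(x, add, /*replica_group_ids=*/{0, 1, 0, 1});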
-
- // Enqueues an operation that scatters the `source` array to the selected
- // indices of each window.
- XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- Padding padding, const XlaOp& source,
- const XlaOp& init_value,
- const XlaComputation& scatter);
-
- // As SelectAndScatter(), but the padding is given in the format
- // returned by MakePadding().
- XlaOp SelectAndScatterWithGeneralPadding(
- const XlaOp& operand, const XlaComputation& select,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- const XlaOp& source, const XlaOp& init_value,
- const XlaComputation& scatter);
-
- // Enqueues an abs instruction onto the computation.
- XlaOp Abs(const XlaOp& operand);
-
- // Enqueues an atan2 instruction onto the computation.
- XlaOp Atan2(const XlaOp& y, const XlaOp& x,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues an exp instruction onto the computation.
- XlaOp Exp(const XlaOp& operand);
-
- // Enqueues an expm1 instruction onto the computation.
- XlaOp Expm1(const XlaOp& operand);
-
- // Enqueues a floor instruction onto the computation.
- XlaOp Floor(const XlaOp& operand);
-
- // Enqueues a ceil instruction onto the computation.
- XlaOp Ceil(const XlaOp& operand);
-
- // Enqueues a round instruction onto the computation, rounding to nearest,
- // with half-way cases rounding away from zero.
- XlaOp Round(const XlaOp& operand);
-
- // Enqueues a log instruction (natural logarithm) onto the computation.
- XlaOp Log(const XlaOp& operand);
-
- // Enqueues a log1p instruction (log(x+1)) onto the computation.
- XlaOp Log1p(const XlaOp& operand);
-
- // Enqueues a sign instruction onto the computation.
- XlaOp Sign(const XlaOp& operand);
-
- // Enqueues a count leading zeros instruction onto the computation.
- XlaOp Clz(const XlaOp& operand);
-
- // Enqueues a cosine instruction onto the computation.
- XlaOp Cos(const XlaOp& operand);
-
- // Enqueues a sine instruction onto the computation.
- XlaOp Sin(const XlaOp& operand);
-
- // Enqueues a tanh instruction onto the computation.
- XlaOp Tanh(const XlaOp& operand);
-
- // Enqueues a real-part instruction onto the computation.
- XlaOp Real(const XlaOp& operand);
-
- // Enqueues an imaginary-part instruction onto the computation.
- XlaOp Imag(const XlaOp& operand);
-
- // Enqueues a lhs^rhs computation onto the computation.
- XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
- // Enqueues an operator that tests if the operand's values are finite, i.e.,
- // not Inf or NaN. Defined only for floating-point types. Returns an array of
- // booleans with the same shape where entries are true iff the corresponding
- // entry was finite.
- XlaOp IsFinite(const XlaOp& operand);
-
- // Enqueues a convert instruction onto the computation that changes the
- // element type of the operand array to primitive_type.
- XlaOp ConvertElementType(const XlaOp& operand,
- PrimitiveType new_element_type);
-
- // Enqueues a no-op instruction onto the computation that changes
- // the element type of the operand array to primitive_type. The
- // bit-widths of the source and destination element types must be
- // identical.
- XlaOp BitcastConvertType(const XlaOp& operand,
- PrimitiveType new_element_type);
-
- // Enqueues a negate instruction onto the computation.
- XlaOp Neg(const XlaOp& operand);
-
- // Enqueues a transpose instruction onto the computation.
- XlaOp Transpose(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> permutation);
-
- // Enqueues a reverse instruction onto the computation. The order of the
- // elements in the given dimensions is reversed (i.e., the element at index i
- // is moved to index dimension_size - 1 - i).
- XlaOp Rev(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> dimensions);
-
- // Enqueues a sort (as increasing order) instruction onto the computation.
- // If only keys are provided:
- // * If the keys are a rank-1 tensor (an array), the result is a sorted array
- // of keys, in ascending order.
- // * If the keys have higher rank, the keys are sorted along the provided
- // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
- // value of 0 will independently sort every column, and a dimension value of 1
- // will independently sort each row. If no dimension number is provided, then
- // the last dimension is chosen by default.
- //
- // If both keys and values are provided:
- // * The keys and the values must be tensors with the same dimensions. The
- // element types of the tensors may be different.
- // * The result is a tuple that consists of a sorted tensor of keys (along the
- // provided dimension, as above) as the first element, and a tensor with their
- // corresponding values as the second element.
- XlaOp Sort(XlaOp keys,
- tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
- int64 dimension = -1);
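A sketch of both forms; the default dimension of -1 selects the last dimension, and the key/value form returns a two-element tuple:

  XlaBuilder b("sort");
  auto keys = ConstantR2<float>(&b, {{3, 1, 2}, {6, 4, 5}});
  // Sorts each row independently: {{1, 2, 3}, {4, 5, 6}}.
  auto sorted = Sort(keys);
  // Carrying values along: element 0 of the tuple is the sorted keys,
  // element 1 the correspondingly reordered values.
  auto values = ConstantR2<int32>(&b, {{0, 1, 2}, {0, 1, 2}});
  auto kv = Sort(keys, values);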
-
- // Enqueues a clamp instruction onto the computation.
- XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
-
- // Enqueues a map instruction onto the computation.
- XlaOp Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> dimensions,
- tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
-
- // Enqueues an N(mu, sigma) random number generation instruction onto the
- // computation.
- XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
-
- // Enqueues a U(a, b) random number generation instruction onto the
- // computation. Returns values in the semi-open interval [a, b).
- XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
-
- // Enqueues a while node onto the computation.
- XlaOp While(const XlaComputation& condition, const XlaComputation& body,
- const XlaOp& init);
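A minimal counting-loop sketch; condition and body are built as separate computations over the s32 loop-carried value (ShapeUtil and ValueOrDie are assumed as before):

  XlaBuilder b("while");
  Shape s32_scalar = ShapeUtil::MakeShape(S32, {});

  // Condition: iter < 10.
  XlaBuilder cond_b("cond");
  Lt(Parameter(&cond_b, 0, s32_scalar, "iter"),
     ConstantR0<int32>(&cond_b, 10));
  XlaComputation cond = cond_b.Build().ValueOrDie();

  // Body: iter + 1.
  XlaBuilder body_b("body");
  Add(Parameter(&body_b, 0, s32_scalar, "iter"),
      ConstantR0<int32>(&body_b, 1));
  XlaComputation body = body_b.Build().ValueOrDie();

  // Counts from 0 up to 10; the result is the final loop-carried value.
  auto result = While(cond, body, ConstantR0<int32>(&b, 0));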
-
- // Enqueues a conditional node onto the computation.
- XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
- const XlaComputation& true_computation,
- const XlaOp& false_operand,
- const XlaComputation& false_computation);
-
- // Enqueues a ReducePrecision node onto the computation.
- XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
- const int mantissa_bits);
-
- // Enqueues a Gather node onto the computation.
- XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
- const GatherDimensionNumbers& dimension_numbers,
- tensorflow::gtl::ArraySlice<int64> window_bounds);
-
- // Enqueues a Send node onto the computation for device-to-device
- // communication, to send the given operand to a Recv instruction that shares
- // the same channel handle.
- void Send(const XlaOp& operand, const ChannelHandle& handle);
- XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
- const ChannelHandle& handle);
-
- // Enqueues a Send node which sends data to the host.
- XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
- const Shape& shape_with_layout, const ChannelHandle& handle);
-
- // Enqueues a Recv node which receives data from the host.
- XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
- const ChannelHandle& handle);
-
- // Enqueues an AfterAll operation with no operands producing a token-shaped
- // value.
- XlaOp CreateToken();
-
- // Enqueues an AfterAll operation that joins the given tokens into a single
- // token-shaped value.
- XlaOp AfterAll(tensorflow::gtl::ArraySlice<XlaOp> tokens);
-
- // Enqueues a Recv node onto the computation. The data comes from a Send
- // instruction that shares the same channel handle and its shape must
- // be the same as the given shape.
- XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
- XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
- const ChannelHandle& handle);
-
- // Normalizes operand across spatial and batch dimensions for each feature.
- //
- // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
- // is the normalized result and batch_mean and batch_var are the mean and
- // variance, respectively, across batch for the operand.
- XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
- const XlaOp& offset, float epsilon,
- int64 feature_index);
-
- // Normalizes operand across spatial and batch dimensions for each feature.
- //
- // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
- // computing `mean` and `variance` for each batch inside the operation. It
- // uses the input `mean` and `variance` instead as estimated values. The
- // purpose of this op is to reduce latency in inference, hence the name
- // `BatchNormInference`.
- //
- // The output has the same shape as `operand`, and contains the normalized
- // values for each batch.
- XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
- const XlaOp& offset, const XlaOp& mean,
- const XlaOp& variance, float epsilon,
- int64 feature_index);
-
- // Calculates the gradients of a batch norm op.
- //
- // The inputs `batch_mean` and `batch_var` represent the mean and variance
- // across the batch.
- //
- // Returns a tuple of three elements:
- // - grad_operand: Gradient with respect to input `operand`
- // - grad_offset: Gradient with respect to input `offset`
- // - grad_scale: Gradient with respect to input `scale`
- XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
- const XlaOp& batch_mean, const XlaOp& batch_var,
- const XlaOp& grad_output, float epsilon,
- int64 feature_index);
-
- StatusOr<XlaOp> AddInstruction(
- HloInstructionProto&& instr, HloOpcode opcode,
- tensorflow::gtl::ArraySlice<XlaOp> operands = {});
-
- void AddCalledComputation(const XlaComputation& computation,
- HloInstructionProto* instr);
-
- StatusOr<const HloInstructionProto*> LookUpInstruction(const XlaOp& op) const;
-
- // Internal helper method that does the building for an arbitrary unary op.
- XlaOp UnaryOp(HloOpcode unop, const XlaOp& operand);
-
- // Internal helper method that does the building for an arbitrary binary op.
- // broadcast_dimensions specifies which dimensions to use for broadcasting
- // when the operation is between tensors of different ranks.
- XlaOp BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
- // Internal helper method that does the building for an arbitrary ternary op.
- XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
- const XlaOp& ehs);
-
- XlaOp RngOp(RandomDistribution distribution,
- tensorflow::gtl::ArraySlice<XlaOp> parameters,
- const Shape& shape);
-
- StatusOr<XlaOp> InDimBroadcast(
- const Shape& shape, const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
- // Internal helper method that creates a sequence of instructions that
- // performs an explicit broadcast of the operand to the target shape.
- StatusOr<XlaOp> AddBroadcastSequence(const Shape& output_shape,
- const XlaOp& operand);
-
- // Internal helper method for creating a Reshape op with the already inferred
- // shape.
- StatusOr<XlaOp> Reshape(const Shape& shape, const XlaOp& operand);
-
- // Returns the (inferred) program shape for the current computation, and
- // stores the id of the root instruction in *root_id.
- StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
-
- // Returns shapes for the operands.
- StatusOr<std::vector<Shape>> GetOperandShapes(
- tensorflow::gtl::ArraySlice<XlaOp> operands) const;
-
- // A visitor which checks whether an operation is a compile-time constant,
- // meaning that it doesn't depend on any parameters, or on any stateful
- // operation such as `RngNormal` or `Infeed`. The visitor walks the
- // computation starting at a given operation and sets is_constant to false iff
- // a parameter or stateful operation is encountered.
- void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
- bool* is_constant) const;
-
- // Checks bounds for convolution parameters.
- Status VerifyConvolution(
- const Shape& lhs_shape, const Shape& rhs_shape,
- const ConvolutionDimensionNumbers& dimension_numbers) const;
-
- // Helper function for creating a Window proto from user-supplied data.
- // Returns error if the user-supplied data was invalid.
- StatusOr<Window> MakeWindow(
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- tensorflow::gtl::ArraySlice<int64> lhs_dilation,
- tensorflow::gtl::ArraySlice<int64> rhs_dilation) const;
-
- string name_; // Name to use for the built computation.
-
- // The first error encountered while building the computation.
- // This is OK until the first error is encountered.
- Status first_error_;
-
- // The saved stack trace from the point at which the first error occurred.
- tensorflow::SavedStackTrace first_error_backtrace_;
-
- // The instructions of this computation.
- std::vector<HloInstructionProto> instructions_;
-
- // The embedded computations used by this computation. Each computation was
- // the entry computation of some XlaComputation; the key is the unique id of
- // that XlaComputation.
- std::map<int64, HloComputationProto> embedded_;
-
- // The unique parameter numbers.
- tensorflow::gtl::FlatSet<int64> parameter_numbers_;
-
- // The metadata to attach to each op. This is structured as a "modal"-like
- // operation, in order to simplify client code (and not sprinkle this metadata
- // throughout the TensorFlow op kernel implementations).
- OpMetadata metadata_;
-
- // Sharding for this operator. This is structured as a "modal"-like operation,
- // in order to simplify client code, similar to metadata_.
- tensorflow::gtl::optional<OpSharding> sharding_;
-
- // Mode bit that indicates whether to die when a first error is encountered.
- bool die_immediately_on_error_ = false;
-
- XlaBuilder* parent_builder_{nullptr};
-
- friend XlaOp Parameter(XlaBuilder* builder, int64 parameter_number,
- const Shape& shape, const string& name);
- friend XlaOp ConstantLiteral(XlaBuilder* builder,
- const LiteralSlice& literal);
- template <typename NativeT>
- friend XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
- template <typename NativeT>
- friend XlaOp ConstantR1(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<NativeT> values);
- friend XlaOp ConstantR1(XlaBuilder* builder,
- const tensorflow::core::Bitmap& values);
- template <typename NativeT>
- friend XlaOp ConstantR2(
- XlaBuilder* builder,
- std::initializer_list<std::initializer_list<NativeT>> values);
- template <typename NativeT>
- friend XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
- const Array<NativeT>& values,
- const Layout& layout);
- template <typename NativeT>
- friend XlaOp ConstantFromArray(XlaBuilder* builder,
- const Array<NativeT>& values);
- template <typename NativeT>
- friend XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
- const Array2D<NativeT>& values,
- const Layout& layout);
- template <typename NativeT>
- friend XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
- const Array2D<NativeT>& values);
- template <typename NativeT>
- friend XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
- const Array3D<NativeT>& values,
- const Layout& layout);
- template <typename NativeT>
- friend XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
- const Array3D<NativeT>& values);
- template <typename NativeT>
- friend XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
- const Array4D<NativeT>& values,
- const Layout& layout);
- template <typename NativeT>
- friend XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
- const Array4D<NativeT>& values);
-
- template <typename NativeT>
- friend XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
-
- friend XlaOp Broadcast(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
-
- friend XlaOp BroadcastInDim(
- const XlaOp& operand, const Shape& shape,
- const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
- friend XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
- const PaddingConfig& padding_config);
-
- friend XlaOp Reshape(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> dimensions,
- tensorflow::gtl::ArraySlice<int64> new_sizes);
-
- friend XlaOp Reshape(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> new_sizes);
-
- friend XlaOp Collapse(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> dimensions);
-
- friend XlaOp Slice(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> start_indices,
- tensorflow::gtl::ArraySlice<int64> limit_indices,
- tensorflow::gtl::ArraySlice<int64> strides);
-
- friend XlaOp SliceInDim(const XlaOp& operand, int64 start_index,
- int64 limit_index, int64 stride, int64 dimno);
-
- friend XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
- tensorflow::gtl::ArraySlice<int64> slice_sizes);
-
- friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
- const XlaOp& start_indices);
-
- friend XlaOp ConcatInDim(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<XlaOp> operands,
- int64 dimension);
-
- friend void Trace(const string& tag, const XlaOp& operand);
-
- friend XlaOp Select(const XlaOp& pred, const XlaOp& on_true,
- const XlaOp& on_false);
- friend XlaOp Tuple(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<XlaOp> elements);
- friend XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
- friend XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
- friend XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
- const DotDimensionNumbers& dimension_numbers);
- friend XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- Padding padding);
- friend XlaOp ConvWithGeneralPadding(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
- friend XlaOp ConvWithGeneralDimensions(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
- const ConvolutionDimensionNumbers& dimension_numbers);
- friend XlaOp ConvGeneral(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- const ConvolutionDimensionNumbers& dimension_numbers);
- friend XlaOp ConvGeneralDilated(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- tensorflow::gtl::ArraySlice<int64> lhs_dilation,
- tensorflow::gtl::ArraySlice<int64> rhs_dilation,
- const ConvolutionDimensionNumbers& dimension_numbers);
- friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
- tensorflow::gtl::ArraySlice<int64> fft_length);
- friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
- const string& config);
- friend void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
- const string& outfeed_config);
- friend XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<XlaOp> operands);
- friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
- tensorflow::gtl::ArraySlice<XlaOp> operands,
- const Shape& shape);
- friend XlaOp HostCompute(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<XlaOp> operands,
- const string& channel_name, int64 cost_estimate_ns,
- const Shape& shape);
- friend XlaOp Complex(const XlaOp& real, const XlaOp& imag,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Conj(const XlaOp& operand);
- friend XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Not(const XlaOp& operand);
- friend XlaOp ShiftLeft(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp ShiftRightArithmetic(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp ShiftRightLogical(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
- friend XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation);
- friend XlaOp ReduceWindow(
- const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
- friend XlaOp ReduceWindowWithGeneralPadding(
- const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
- friend XlaOp CrossReplicaSum(
- const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> replica_group_ids);
- friend XlaOp CrossReplicaSum(
- const XlaOp& operand, const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> replica_group_ids,
- const tensorflow::gtl::optional<ChannelHandle>& channel_id);
- friend XlaOp SelectAndScatter(
- const XlaOp& operand, const XlaComputation& select,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
- const XlaOp& source, const XlaOp& init_value,
- const XlaComputation& scatter);
- friend XlaOp SelectAndScatterWithGeneralPadding(
- const XlaOp& operand, const XlaComputation& select,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- const XlaOp& source, const XlaOp& init_value,
- const XlaComputation& scatter);
- friend XlaOp Abs(const XlaOp& operand);
- friend XlaOp Atan2(const XlaOp& y, const XlaOp& x,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp Exp(const XlaOp& operand);
- friend XlaOp Expm1(const XlaOp& operand);
- friend XlaOp Floor(const XlaOp& operand);
- friend XlaOp Ceil(const XlaOp& operand);
- friend XlaOp Round(const XlaOp& operand);
- friend XlaOp Log(const XlaOp& operand);
- friend XlaOp Log1p(const XlaOp& operand);
- friend XlaOp Sign(const XlaOp& operand);
- friend XlaOp Clz(const XlaOp& operand);
- friend XlaOp Cos(const XlaOp& operand);
- friend XlaOp Sin(const XlaOp& operand);
- friend XlaOp Tanh(const XlaOp& operand);
- friend XlaOp Real(const XlaOp& operand);
- friend XlaOp Imag(const XlaOp& operand);
- friend XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
- friend XlaOp IsFinite(const XlaOp& operand);
- friend XlaOp ConvertElementType(const XlaOp& operand,
- PrimitiveType new_element_type);
- friend XlaOp BitcastConvertType(const XlaOp& operand,
- PrimitiveType new_element_type);
- friend XlaOp Neg(const XlaOp& operand);
- friend XlaOp Transpose(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> permutation);
- friend XlaOp Rev(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> dimensions);
- friend XlaOp Sort(XlaOp keys, tensorflow::gtl::optional<XlaOp> values,
- int64 dimension);
- friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
- friend XlaOp Map(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<XlaOp> operands,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> dimensions,
- tensorflow::gtl::ArraySlice<XlaOp> static_operands);
- friend XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma,
- const Shape& shape);
- friend XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
- friend XlaOp While(const XlaComputation& condition,
- const XlaComputation& body, const XlaOp& init);
- friend XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
- const XlaComputation& true_computation,
- const XlaOp& false_operand,
- const XlaComputation& false_computation);
- friend XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
- const int mantissa_bits);
- friend XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
- const GatherDimensionNumbers& dimension_numbers,
- tensorflow::gtl::ArraySlice<int64> window_bounds);
- friend void Send(const XlaOp& operand, const ChannelHandle& handle);
- friend XlaOp Recv(XlaBuilder* builder, const Shape& shape,
- const ChannelHandle& handle);
- friend XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
- const XlaOp& offset, float epsilon,
- int64 feature_index);
- friend XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
- const XlaOp& offset, const XlaOp& mean,
- const XlaOp& variance, float epsilon,
- int64 feature_index);
- friend XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
- const XlaOp& batch_mean, const XlaOp& batch_var,
- const XlaOp& grad_output, float epsilon,
- int64 feature_index);
- friend XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
- const ChannelHandle& handle);
- friend XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
- const ChannelHandle& handle);
- friend XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
- const Shape& shape_with_layout,
- const ChannelHandle& handle);
- friend XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
- const ChannelHandle& handle);
- friend XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
- const string& config);
- friend XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
- const Shape& shape_with_layout,
- const string& outfeed_config);
- friend XlaOp CreateToken(XlaBuilder* builder);
- friend XlaOp AfterAll(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<XlaOp> tokens);
-};
-
-// RAII-style object: sets the current sharding assignment in builder on
-// construction, and sets back to the previous assignment on destruction.
-class XlaScopedShardingAssignment {
- public:
- XlaScopedShardingAssignment(xla::XlaBuilder* builder,
- tensorflow::gtl::optional<OpSharding> sharding)
- : builder_(builder), prev_sharding_(builder->sharding()) {
- SetSharding(sharding);
- }
-
- XlaScopedShardingAssignment(const XlaScopedShardingAssignment&) = delete;
- XlaScopedShardingAssignment& operator=(const XlaScopedShardingAssignment&) =
- delete;
-
- ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); }
-
- private:
- void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
- if (sharding.has_value()) {
- builder_->SetSharding(sharding.value());
- } else {
- builder_->ClearSharding();
- }
- }
-
- xla::XlaBuilder* const builder_;
- tensorflow::gtl::optional<OpSharding> prev_sharding_;
-};
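A usage sketch of the RAII scope (the OpSharding proto here is default-constructed for illustration; real code would populate it):

  XlaBuilder b("sharded");
  {
    OpSharding sharding;  // illustrative; configure the proto as needed
    XlaScopedShardingAssignment assign(&b, sharding);
    // Ops built here carry `sharding`.
    auto x = ConstantR0<float>(&b, 1.0f);
  }  // The previous assignment (here: none) is restored.
  auto y = ConstantR0<float>(&b, 2.0f);  // built without the sharding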
-
-// Free functions for building XlaOps. The intention is that these will
-// become the public API for building XlaOps rather than calling methods on
-// XlaBuilder directly.
-
-// Enqueues a "retrieve parameter value" instruction for a parameter that was
-// passed to the computation.
-XlaOp Parameter(XlaBuilder* builder, int64 parameter_number, const Shape& shape,
- const string& name);
-
-// Enqueues a constant with the value of the given literal onto the
-// computation.
-XlaOp ConstantLiteral(XlaBuilder* builder, const LiteralSlice& literal);
-
-// Enqueues a constant onto the computation. Methods are templated on the
-// native host type (NativeT) which corresponds to a specific XLA
-// PrimitiveType as given in the following table:
-//
-// Native Type PrimitiveType
-// -----------------------------
-// bool PRED
-// int32 S32
-// int64 S64
-// uint32 U32
-// uint64 U64
-// float F32
-// double F64
-//
-// Note: not all primitive types defined in xla_data.proto have a
-// corresponding native type yet.
-template <typename NativeT>
-XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
-template <typename NativeT>
-XlaOp ConstantR1(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<NativeT> values);
-XlaOp ConstantR1(XlaBuilder* builder, const tensorflow::core::Bitmap& values);
-template <typename NativeT>
-XlaOp ConstantR2(XlaBuilder* builder,
- std::initializer_list<std::initializer_list<NativeT>> values);
-template <typename NativeT>
-XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
- const Array<NativeT>& values,
- const Layout& layout);
-template <typename NativeT>
-XlaOp ConstantFromArray(XlaBuilder* builder, const Array<NativeT>& values);
-template <typename NativeT>
-XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
- const Array2D<NativeT>& values,
- const Layout& layout);
-template <typename NativeT>
-XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
- const Array2D<NativeT>& values);
-template <typename NativeT>
-XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
- const Array3D<NativeT>& values,
- const Layout& layout);
-template <typename NativeT>
-XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
- const Array3D<NativeT>& values);
-template <typename NativeT>
-XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
- const Array4D<NativeT>& values,
- const Layout& layout);
-template <typename NativeT>
-XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
- const Array4D<NativeT>& values);
-
-// Enqueues a rank one constant (vector) onto the computation. The vector has
-// size 'length' and every element has the value 'value'.
-template <typename NativeT>
-XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
-
-// Adds dimensions to an array by duplicating the data in the array.
-//
-// The new dimensions are inserted on the left, i.e. if
-// broadcast_sizes has values {a0, ..., aN} and the operand shape
-// has dimensions {b0, ..., bM} then the shape of the output has
-// dimensions {a0, ..., aN, b0, ..., bM}.
-//
-// The new dimensions index into copies of the operand, i.e.
-//
-// output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
-XlaOp Broadcast(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
-
-// Performs in-dimension-style broadcast.
-//
-// Operand specifies the input to be broadcast. "shape" is the expected
-// output shape. "broadcast_dimensions" are the dimensions to broadcast
-// into. Dimension numbers in broadcast_dimensions map to individual
-// dimensions of the operand, and specify which dimension of the output
-// shape they are broadcast into.
-// e.g.
-// Say operand = [1, 2], i.e., a 1D tensor with 2 elements,
-// and the shape's dimensions are [2,2].
-// Specifying {1} as broadcast_dimensions will generate output
-// [1 , 2]
-// [1 , 2]
-// On the other hand, specifying {0} as broadcast_dimensions
-// will generate output
-// [1 , 1]
-// [2 , 2]
-XlaOp BroadcastInDim(
- const XlaOp& operand, const Shape& shape,
- const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
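A sketch of the two mappings from the example above (ShapeUtil::MakeShape is assumed from XLA's shape utilities):

  XlaBuilder b("bcast_in_dim");
  auto v = ConstantR1<float>(&b, {1.0f, 2.0f});
  Shape target = ShapeUtil::MakeShape(F32, {2, 2});
  // {1} maps v onto output dimension 1: rows {1,2} / {1,2}.
  auto rows = BroadcastInDim(v, target, /*broadcast_dimensions=*/{1});
  // {0} maps v onto output dimension 0: rows {1,1} / {2,2}.
  auto cols = BroadcastInDim(v, target, /*broadcast_dimensions=*/{0});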
-
-// Enqueues a pad operation onto the computation that pads the given value on
-// the edges as well as between the elements of the input. padding_config
-// specifies the padding amount for each dimension.
-XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
- const PaddingConfig& padding_config);
-
-// Enqueues an operation onto the computation that flattens the operand based
-// on the dimension order (major/slowest-varying to minor/fastest-varying)
-// given, followed by reshaping it into the shape with the given dimension
-// sizes (also major to minor). Conceptually, this is a limited form of
-// "shape casting".
-XlaOp Reshape(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> dimensions,
- tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-// Enqueues an operation onto the computation that collapses the operand from
-// first to last dimension (C order), then reshapes it to the given dimension
-// sizes. Conceptually, this is a limited form of "shape casting".
-XlaOp Reshape(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-// Wrapper for Reshape.
-// Enqueues an operation to collapse the provided dimensions; e.g. an
-// operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
-// {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
-// be a consecutive, in-order subsequence of the operand dimensions.
-//
-// Note that collapsing a single dimension does nothing:
-//
-// {256} collapsing {0} => {256}
-// {1} collapsing {0} => {1}
-//
-// Collapsing multiple dimensions produces a single result dimension:
-//
-// {256, 2} collapsing {0,1} => {512}
-// {256, 2, 3} collapsing {0,1} => {512, 3}
-//
-// This could potentially cause data to be moved -- it provides a more
-// structured form of reshaping than an arbitrary Reshape operation.
-XlaOp Collapse(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> dimensions);
-
-// Enqueues a slice operation onto the computation that slices the operand
-// from the start indices to the limit indices; e.g.
-//
-// x
-// [ 0 1 2 3 ]
-// y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
-// [ 8 9 a b ]
-//
-// Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
-// range notation.
-// The strides parameter determines the stride over the slice.
-XlaOp Slice(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> start_indices,
- tensorflow::gtl::ArraySlice<int64> limit_indices,
- tensorflow::gtl::ArraySlice<int64> strides);
-
-// Enqueues a slice operation in a given dimension, taking all other
-// dimensions as they are; e.g. if dimno is 1 from start_index 2 to
-// limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
-// for:
-//
-// array[:, 2:4:1, :]
-XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
- int64 stride, int64 dimno);
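Matching the array[:, 2:4:1, :] shorthand above, a minimal sketch:

  XlaBuilder b("slice_in_dim");
  auto arr = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {7, 8, 9}), "arr");
  // Result shape is f32[7,2,9].
  auto sliced = SliceInDim(arr, /*start_index=*/2, /*limit_index=*/4,
                           /*stride=*/1, /*dimno=*/1);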
-
-// Enqueues a slice operation onto the computation that slices the 'operand'
-// from dynamic start indices which are passed in 'start_indices'.
-// The size of the slice in each dimension is passed in 'slice_sizes',
-// which specifies the exclusive end point of the slice interval in each
-// dimension: [start, start + size).
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
-// Slice index calculations are computed modulo input dimension sizes to
-// prevent dynamic start indices from generating out-of-bound array accesses.
-XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
- tensorflow::gtl::ArraySlice<int64> slice_sizes);
-
-// Enqueues a dynamic update slice operation onto the computation, which
-// updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
-// The shape of 'update' determines the shape of the slice of 'operand'
-// which is updated.
-// The indices specified in 'start_indices' specify the offset of the slice
-// of 'operand' which is updated.
-//
-// update = {10, 11} // calculated at runtime.
-// [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ]
-// [4 5 6] => DynamicUpdateSlice(data, update, start) => [4 10 11]
-// [7 8 9] [7 8 9 ]
-//
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
-// Slice index calculations are computed modulo update dimension sizes to
-// prevent dynamic start indices from generating out-of-bound array accesses.
-XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
- const XlaOp& start_indices);
-
-// Enqueues a concatenate instruction onto the computation. 'operands' must
-// have >= 1 entry.
-XlaOp ConcatInDim(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<XlaOp> operands, int64 dimension);
-
-// Enqueues a tracing operation onto the computation; the computation will emit
-// a logging message with the operand.
-void Trace(const string& tag, const XlaOp& operand);
-
-// Enqueues a conditional-move-like select operation onto the computation;
-// predicated on pred, selects between on_true and on_false.
-XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
-
-// Enqueues a tuple-creation instruction onto the computation.
-XlaOp Tuple(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> elements);
-
-// Enqueues a tuple-element-get instruction onto the computation.
-XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
-
-// Enqueues an equal-to comparison instruction onto the computation.
-XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a not-equal comparison instruction onto the computation.
-XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a greater-or-equal comparison instruction onto the computation.
-XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a greater-than comparison instruction onto the computation.
-XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a less-than comparison instruction onto the computation.
-XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a less-or-equal comparison instruction onto the computation.
-XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a dot instruction onto the computation.
-XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
-
-// Enqueues a general dot instruction onto the computation.
-XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
- const DotDimensionNumbers& dimension_numbers);
-
-// Enqueues a convolution instruction onto the computation, which uses the
-// default convolution dimension numbers.
-XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
-
-// Enqueues a convolution instruction onto the computation, with the caller
-// provided padding configuration in the format returned by MakePadding().
-XlaOp ConvWithGeneralPadding(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
-// Enqueues a convolution instruction onto the computation, with the caller
-// provided dimension numbers configuration.
-XlaOp ConvWithGeneralDimensions(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
- const ConvolutionDimensionNumbers& dimension_numbers);
-
-// Enqueues a convolution instruction onto the computation, with the caller
-// provided padding configuration as well as the dimension numbers.
-XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- const ConvolutionDimensionNumbers& dimension_numbers);
-
-// Enqueues a convolution instruction onto the computation, with the caller
-// provided padding configuration, dilation factors and dimension numbers.
-XlaOp ConvGeneralDilated(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- tensorflow::gtl::ArraySlice<int64> lhs_dilation,
- tensorflow::gtl::ArraySlice<int64> rhs_dilation,
- const ConvolutionDimensionNumbers& dimension_numbers);
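
A minimal sketch of the simplest variant, Conv(). It assumes the default
dimension numbers lay out the input as [batch, feature, spatial...] and the
kernel as [out_features, in_features, spatial...]; treat that layout as an
assumption to verify, not a guarantee:

  XlaBuilder b("conv_example");
  XlaOp input = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {1, 3, 8, 8}), "input");
  XlaOp kernel = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {16, 3, 3, 3}), "kernel");
  XlaOp out = Conv(input, kernel, /*window_strides=*/{1, 1}, Padding::kSame);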
-
-// Enqueues an FFT instruction onto the computation, of the given type and
-// with the given FFT length.
-XlaOp Fft(const XlaOp& operand, FftType fft_type,
- tensorflow::gtl::ArraySlice<int64> fft_length);
-
-// Enqueues an infeed instruction onto the computation, which writes data of
-// the given shape to the infeed buffer of the device.
-XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
- const string& config = "");
-
-// Variant of Infeed which takes a token-shaped operand and produces a
-// two-element tuple containing the data value and a token-shaped value.
-// Tokens are used for ordering side-effecting operations.
-// TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
- const string& config = "");
-
-// Enqueues an outfeed instruction onto the computation. This instruction
-// generates outgoing data transfers for the given data.
-//
-// shape_with_layout communicates the laid out shape that we want to outfeed
-// -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
-// will occur.
-void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
- const string& outfeed_config);
-
-// Variant of Outfeed which takes a token-shaped operand and produces a
-// token-shaped value. Tokens are used for ordering side-effecting operations.
-// TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
- const Shape& shape_with_layout,
- const string& outfeed_config);
-
-// Enqueues a call instruction onto the computation.
-XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<XlaOp> operands);
-
-// Enqueues a custom call instruction onto the computation.
-// During code generation, a call instruction is emitted which targets a
-// symbol with the name |call_target_name|. The |operands| are passed to the
-// call instruction. |shape| is the resultant shape.
-XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
- tensorflow::gtl::ArraySlice<XlaOp> operands,
- const Shape& shape);
-
-// Enqueues a pseudo-op to represent host-side computation data-dependencies.
-// During code generation, host send and receive operations will be generated
-// to transfer |operands| to the host and a single result of |shape| back to
-// the device. Host send/recv operations are emitted using |channel_name|.
-// Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
-// instruction scheduling.
-XlaOp HostCompute(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<XlaOp> operands,
- const string& channel_name, int64 cost_estimate_ns,
- const Shape& shape);
-
-// The following methods enqueue element-wise binary arithmetic operations
-// onto the computation. The shapes of the operands have to match unless one
-// of the operands is a scalar, or an explicit broadcast dimension is given
-// (see g3doc for more details).
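
For instance, a sketch of the explicit-broadcast form (builder and parameter
names are illustrative):

  XlaBuilder b("broadcast_example");
  XlaOp m = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3}), "m");
  XlaOp v = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {3}), "v");
  XlaOp sum = Add(m, v, /*broadcast_dimensions=*/{1});  // v is added to each row of m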
-
-// Enqueues a complex compose instruction onto the computation.
-XlaOp Complex(const XlaOp& real, const XlaOp& imag,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a complex conjugate instruction onto the computation.
-XlaOp Conj(const XlaOp& operand);
-
-// Enqueues an add instruction onto the computation.
-XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a subtract instruction onto the computation.
-XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a multiply instruction onto the computation.
-XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a divide instruction onto the computation.
-XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a remainder instruction onto the computation.
-XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a max instruction onto the computation.
-XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a min instruction onto the computation.
-XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Element-wise logical operators
-XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-XlaOp Not(const XlaOp& operand);
-
-XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-XlaOp ShiftRightArithmetic(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-XlaOp ShiftRightLogical(
- const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Reduces an array across the provided dimensions, given "computation" as a
-// reduction operator.
-XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
-
-// Convenience wrapper around the above that reduces all the dimensions in the
-// operand shape.
-XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation);
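
A sketch of both forms, assuming `add` is a scalar f32 addition XlaComputation
built elsewhere (an assumed name, not part of this header):

  XlaBuilder b("reduce_example");
  XlaOp x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 5}), "x");
  XlaOp zero = ConstantR0<float>(&b, 0.0f);
  XlaOp row_sums = Reduce(x, zero, add, /*dimensions_to_reduce=*/{1});  // f32[4]
  XlaOp total = ReduceAll(x, zero, add);                                // f32[] scalar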
-
-// Enqueues a windowed reduce instruction onto the computation.
-XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- Padding padding);
-
-// As ReduceWindow(), but the padding is given in the format
-// returned by MakePadding().
-XlaOp ReduceWindowWithGeneralPadding(
- const XlaOp& operand, const XlaOp& init_value,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
-// Returns the sum of the operand value within each subgroup of replicas. All
-// replicas supply one input to the sum and all replicas receive the resulting
-// sum for each subgroup.
-XlaOp CrossReplicaSum(
- const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
-
-// Enqueues an operation that does an AllReduce of the operand across cores.
-// Here AllReduce means reducing the input operand across cores and then
-// broadcasting the reduction result to those cores. The reduction function is
-// defined by `computation`, which should be a commutative computation on
-// scalars, e.g., add, min, or max. The way that AllReduce is applied is
-// configured by:
-//
-// - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
-// replicas belong to one group. AllReduce will be applied within subgroups.
-// For example, with 4 replicas, replica_group_ids={0,1,0,1} means replicas
-// 0 and 2 are in subgroup 0, and replicas 1 and 3 are in subgroup 1.
-//
-// - `channel_id`: AllReduce nodes from different models that share the same
-// channel_id will be reduced together. If empty, AllReduce will not be
-// applied across models.
-//
-// TODO(b/79737069): Rename this to AllReduce when it's ready to use.
-XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
- const tensorflow::gtl::optional<ChannelHandle>&
- channel_id = tensorflow::gtl::nullopt);
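
To make the subgrouping concrete, a sketch with 4 replicas (again assuming a
prebuilt scalar `add` computation):

  XlaBuilder b("all_reduce_example");
  XlaOp x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "x");
  // Replicas 0 and 2 sum among themselves; so do replicas 1 and 3.
  XlaOp summed = CrossReplicaSum(x, add, /*replica_group_ids=*/{0, 1, 0, 1});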
-
-// Enqueues an operation that scatters the `source` array to the selected
-// indices of each window.
-XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- Padding padding, const XlaOp& source,
- const XlaOp& init_value, const XlaComputation& scatter);
-
-// As SelectAndScatter(), but the padding is given in the format
-// returned by MakePadding().
-XlaOp SelectAndScatterWithGeneralPadding(
- const XlaOp& operand, const XlaComputation& select,
- tensorflow::gtl::ArraySlice<int64> window_dimensions,
- tensorflow::gtl::ArraySlice<int64> window_strides,
- tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
- const XlaOp& source, const XlaOp& init_value,
- const XlaComputation& scatter);
-
-// Enqueues an abs instruction onto the computation.
-XlaOp Abs(const XlaOp& operand);
-
-// Enqueues an atan2 instruction onto the computation.
-XlaOp Atan2(const XlaOp& y, const XlaOp& x,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues an exp instruction onto the computation.
-XlaOp Exp(const XlaOp& operand);
-
-// Enqueues an expm1 instruction onto the computation.
-XlaOp Expm1(const XlaOp& operand);
-
-// Enqueues a floor instruction onto the computation.
-XlaOp Floor(const XlaOp& operand);
-
-// Enqueues a ceil instruction onto the computation.
-XlaOp Ceil(const XlaOp& operand);
-
-// Enqueues a round instruction onto the computation, rounding to nearest,
-// with half-way cases rounding away from zero.
-XlaOp Round(const XlaOp& operand);
-
-// Enqueues a log instruction (natural logarithm) onto the computation.
-XlaOp Log(const XlaOp& operand);
-
-// Enqueues a log1p instruction (log(x+1)) onto the computation.
-XlaOp Log1p(const XlaOp& operand);
-
-// Enqueues a sign instruction onto the computation.
-XlaOp Sign(const XlaOp& operand);
-
-// Enqueues a count leading zeros instruction onto the computation.
-XlaOp Clz(const XlaOp& operand);
-
-// Enqueues a cosine instruction onto the computation.
-XlaOp Cos(const XlaOp& operand);
-
-// Enqueues a sine instruction onto the computation.
-XlaOp Sin(const XlaOp& operand);
-
-// Enqueues a tanh instruction onto the computation.
-XlaOp Tanh(const XlaOp& operand);
-
-// Enqueues a real-part instruction onto the computation.
-XlaOp Real(const XlaOp& operand);
-
-// Enqueues an imaginary-part instruction onto the computation.
-XlaOp Imag(const XlaOp& operand);
-
-// Enqueues a lhs^rhs computation onto the computation.
-XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
- tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues an operator that tests if the operand's values are finite, i.e.,
-// not Inf or NaN. Defined only for floating-point types. Returns an array of
-// booleans with the same shape where entries are true iff the corresponding
-// entry was finite.
-XlaOp IsFinite(const XlaOp& operand);
-
-// Enqueues a convert instruction onto the computation that changes the
-// element type of the operand array to primitive_type.
-XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type);
-
-// Enqueues a bitcast instruction onto the computation that reinterprets
-// the bits of the operand array as new_element_type without changing
-// them. The bit-widths of the source and destination element types must
-// be identical.
-XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type);
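
A small sketch of the bitcast semantics; both element types here are 32 bits
wide, and the names are illustrative:

  XlaBuilder b("bitcast_example");
  XlaOp f = ConstantR1<float>(&b, {1.0f, -2.0f});
  XlaOp bits = BitcastConvertType(f, S32);  // reinterprets the f32 bit patterns as s32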
-
-// Enqueues a negate instruction onto the computation.
-XlaOp Neg(const XlaOp& operand);
-
-// Enqueues a transpose instruction onto the computation.
-XlaOp Transpose(const XlaOp& operand,
- tensorflow::gtl::ArraySlice<int64> permutation);
-
-// Enqueues a reverse instruction onto the computation. The order of the
-// elements in the given dimensions is reversed (i.e., the element at index i
-// is moved to index dimension_size - 1 - i).
-XlaOp Rev(const XlaOp& operand, tensorflow::gtl::ArraySlice<int64> dimensions);
-
-// Enqueues a sort (in increasing order) instruction onto the computation.
-// If only keys are provided:
-// * If the keys are a rank-1 tensor (an array), the result is a sorted array
-// of keys, in ascending order.
-// * If the keys have higher rank, the keys are sorted along the provided
-// dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-// value of 0 will independently sort every column, and a dimension value of 1
-// will independently sort each row. If no dimension number is provided, then
-// the last dimension is chosen by default.
-//
-// If both keys and values are provided:
-// * The keys and the values must be tensors with the same dimensions. The
-// element types of the tensors may be different.
-// * The result is a tuple that consists of a sorted tensor of keys (along the
-// provided dimension, as above) as the first element, and a tensor with their
-// corresponding values as the second element.
-XlaOp Sort(XlaOp keys,
- tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
- int64 dimension = -1);
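
A keys-and-values sketch (constants illustrative):

  XlaBuilder b("sort_example");
  XlaOp keys = ConstantR1<int32>(&b, {3, 1, 2});
  XlaOp vals = ConstantR1<float>(&b, {30.0f, 10.0f, 20.0f});
  XlaOp sorted = Sort(keys, vals);  // tuple: ({1, 2, 3}, {10.0f, 20.0f, 30.0f})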
-
-// Enqueues a clamp instruction onto the computation.
-XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
-
-// Enqueues a map instruction onto the computation.
-XlaOp Map(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> operands,
- const XlaComputation& computation,
- tensorflow::gtl::ArraySlice<int64> dimensions,
- tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
-
-// Enqueues an N(mu, sigma) random number generation instruction onto the
-// computation.
-XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
-
-// Enqueues a U(a, b) random number generation instruction onto the
-// computation. Returns values in the semi-open interval [a, b).
-XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
-
-// Enqueues a while node onto the computation.
-XlaOp While(const XlaComputation& condition, const XlaComputation& body,
- const XlaOp& init);
-
-// Enqueues a conditional node onto the computation.
-XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
- const XlaComputation& true_computation,
- const XlaOp& false_operand,
- const XlaComputation& false_computation);
-
-// Enqueues a ReducePrecision node onto the computation.
-XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
- const int mantissa_bits);
-
-// Enqueues a Gather node onto the computation.
-XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
- const GatherDimensionNumbers& dimension_numbers,
- tensorflow::gtl::ArraySlice<int64> window_bounds);
-
-// Enqueues a Send node onto the computation for device-to-device
-// communication. This operation sends the given operand to
-// a Recv instruction in a different computation that shares the same channel
-// handle.
-void Send(const XlaOp& operand, const ChannelHandle& handle);
-
-// Variant of Send which takes a token-shaped operand and produces a
-// token-shaped value. Tokens are used for ordering side-effecting operations.
-// TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
- const ChannelHandle& handle);
-
-// Enqueues a Recv node onto the computation for device-to-device
-// communication. The data comes from a Send instruction in a different
-// computation that shares the same channel handle and its shape must be the
-// same as the given shape.
-XlaOp Recv(XlaBuilder* builder, const Shape& shape,
- const ChannelHandle& handle);
-
-// Variant of Recv which takes a token-shaped operand and produces a two-element
-// tuple containing the data value and a token-shaped value. Tokens are used
-// for ordering side-effecting operations.
-// TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
- const ChannelHandle& handle);
-
-// Enqueues a Send node which transfers data from the device to the host. The
-// 'shape_with_layout' argument defines the layout of the data transferred; its
-// shape must be compatible with the shape of the operand. The operand must be
-// array-shaped.
-// TODO(b/111544877): Support tuple shapes.
-XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
- const Shape& shape_with_layout, const ChannelHandle& handle);
-
-// Enqueues a Recv node which transfers data from the host to the device. The
-// given shape must contain a layout and must be an array.
-// TODO(b/111544877): Support tuple shapes.
-XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
- const ChannelHandle& handle);
-
-// Enqueues an operation (AfterAll) with no operands that produces a
-// token-shaped value. Tokens are used for ordering side-effecting operations.
-// This is a separate method from AfterAll to facilitate the removal of
-// operand-less AfterAll instructions.
-// TODO(b/110532604): Remove this function when all tokens are derived from a
-// single token generated or passed into the entry computation.
-XlaOp CreateToken(XlaBuilder* builder);
-
-// Enqueues an AfterAll instruction which produces a token-shaped value and
-// takes a variadic number of token-shaped operands. The number of operands must
-// be greater than zero. Used for joining tokens.
-XlaOp AfterAll(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> tokens);
-
-// Normalizes operand across spatial and batch dimensions for each feature.
-//
-// Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
-// is the normalized result and batch_mean and batch_var are the mean and
-// variance, respectively, across the batch for the operand.
-XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
- const XlaOp& offset, float epsilon,
- int64 feature_index);
-
-// Normalizes operand across spatial and batch dimensions for each feature.
-//
-// `BatchNormInference` is equivalent to calling `BatchNormTraining` without
-// computing `mean` and `variance` for each batch inside the operation. It
-// uses the input `mean` and `variance` instead as estimated values. The
-// purpose of this op is to reduce latency in inference, hence the name
-// `BatchNormInference`.
-//
-// The output has the same shape as `operand`, and contains the normalized
-// values for each batch.
-XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
- const XlaOp& offset, const XlaOp& mean,
- const XlaOp& variance, float epsilon,
- int64 feature_index);
-
-// Calculates the gradients of a batch norm op.
-//
-// The inputs `batch_mean` and `batch_var` represent the mean and variance
-// across the batch.
-//
-// Returns a tuple of three elements:
-// - grad_operand: Gradient with respect to input `operand`
-// - grad_offset: Gradient with respect to input `offset`
-// - grad_scale: Gradient with respect to input `scale`
-XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
- const XlaOp& batch_mean, const XlaOp& batch_var,
- const XlaOp& grad_output, float epsilon,
- int64 feature_index);
-
-// Implementation details below this point.
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR0(NativeT value) {
- return ConstantLiteral(*LiteralUtil::CreateR0<NativeT>(value));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values) {
- return ConstantLiteral(*LiteralUtil::CreateR1<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) {
- Literal literal(ShapeUtil::MakeShape(
- primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
- literal.PopulateWithValue(value);
- return ConstantLiteral(literal);
-}
-
-inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) {
- return ConstantLiteral(*LiteralUtil::CreateR1(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2(
- std::initializer_list<std::initializer_list<NativeT>> values) {
- return ConstantLiteral(*LiteralUtil::CreateR2<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array<NativeT>& values,
- const Layout& layout) {
- return ConstantLiteral(
- *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArray(const Array<NativeT>& values) {
- return ConstantLiteral(*LiteralUtil::CreateFromArray<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout(
- const Array2D<NativeT>& values, const Layout& layout) {
- return ConstantLiteral(
- *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D<NativeT>& values) {
- return ConstantLiteral(*LiteralUtil::CreateR2FromArray2D<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout(
- const Array3D<NativeT>& values, const Layout& layout) {
- return ConstantLiteral(
- *LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D<NativeT>& values) {
- return ConstantFromArray(values);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout(
- const Array4D<NativeT>& values, const Layout& layout) {
- return ConstantFromArrayWithLayout(values, layout);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
- return ConstantFromArray(values);
-}
-
-// Free function template implementations.
-
-template <typename NativeT>
-XlaOp ConstantR0(XlaBuilder* builder, NativeT value) {
- return ConstantLiteral(builder, *LiteralUtil::CreateR0<NativeT>(value));
-}
-
-template <typename NativeT>
-XlaOp ConstantR1(XlaBuilder* builder,
- tensorflow::gtl::ArraySlice<NativeT> values) {
- return ConstantLiteral(builder, *LiteralUtil::CreateR1<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value) {
- Literal literal(ShapeUtil::MakeShape(
- primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
- literal.PopulateWithValue(value);
- return ConstantLiteral(builder, literal);
-}
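
For reference, the length/value overload in use (builder name illustrative):

  XlaBuilder b("constant_example");
  XlaOp quarter = ConstantR1<float>(&b, /*length=*/4, 0.25f);  // f32[4] filled with 0.25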
-
-inline XlaOp ConstantR1(XlaBuilder* builder,
- const tensorflow::core::Bitmap& values) {
- return ConstantLiteral(builder, *LiteralUtil::CreateR1(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantR2(XlaBuilder* builder,
- std::initializer_list<std::initializer_list<NativeT>> values) {
- return ConstantLiteral(builder, *LiteralUtil::CreateR2<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
- const Array<NativeT>& values,
- const Layout& layout) {
- return ConstantLiteral(
- builder,
- *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp ConstantFromArray(XlaBuilder* builder, const Array<NativeT>& values) {
- return ConstantLiteral(builder,
- *LiteralUtil::CreateFromArray<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
- const Array2D<NativeT>& values,
- const Layout& layout) {
- return ConstantLiteral(
- builder,
- *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
- const Array2D<NativeT>& values) {
- return ConstantLiteral(builder,
- *LiteralUtil::CreateR2FromArray2D<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
- const Array3D<NativeT>& values,
- const Layout& layout) {
- return ConstantLiteral(
- builder,
- *LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
- const Array3D<NativeT>& values) {
- return ConstantFromArray(builder, values);
-}
-
-template <typename NativeT>
-XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
- const Array4D<NativeT>& values,
- const Layout& layout) {
- return ConstantFromArrayWithLayout(builder, values, layout);
-}
-
-template <typename NativeT>
-XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
- const Array4D<NativeT>& values) {
- return ConstantFromArray(builder, values);
-}
-
-} // namespace xla
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#endif // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_computation.cc
index 72e3935696..3543d41fc2 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
+++ b/tensorflow/compiler/xla/client/xla_computation.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include <utility>
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h
index 0ffba208b1..71598ef8b2 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_computation.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_
#include <utility>
@@ -64,4 +64,4 @@ class XlaComputation {
} // namespace xla
-#endif // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
+#endif // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 5db124b5a2..0545deb096 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -1775,7 +1775,9 @@ void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
// Nothing to do but assign the shape, which is done above.
return;
default:
- LOG(FATAL) << "Unhandled primitive type " << subshape().element_type();
+ // TODO(b/111551621): Support serializing more PrimitiveTypes.
+ LOG(FATAL) << "Unhandled primitive type "
+ << PrimitiveType_Name(subshape().element_type());
}
}
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index fe346f9956..c8f2d65c22 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -53,9 +53,9 @@ cc_library(
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:executable_build_options",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:math",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/service:shaped_buffer",
"//tensorflow/core:framework_lite",
"//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index f25348e735..434d78d78d 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/python/local_computation_builder.h"
#include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/executable_run_options.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/util.h"
@@ -486,6 +486,11 @@ LocalOp LocalComputationBuilder::ConvertElementType(
return xla::ConvertElementType(operand.op(), new_element_type);
}
+LocalOp LocalComputationBuilder::BitcastConvertType(
+ const LocalOp& operand, PrimitiveType new_element_type) {
+ return xla::BitcastConvertType(operand.op(), new_element_type);
+}
+
LocalOp LocalComputationBuilder::Call(
const LocalComputation& local_computation,
tensorflow::gtl::ArraySlice<LocalOp> operands) {
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 0e0d8ac29a..545aa63f9d 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -19,8 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/executable_build_options.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
@@ -259,6 +259,9 @@ class LocalComputationBuilder {
LocalOp ConvertElementType(const LocalOp& operand,
PrimitiveType new_element_type);
+ LocalOp BitcastConvertType(const LocalOp& operand,
+ PrimitiveType new_element_type);
+
LocalOp Call(const LocalComputation& local_computation,
tensorflow::gtl::ArraySlice<LocalOp> operands);
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index eeccbd7cfa..9b8b0aa7f2 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -957,6 +957,7 @@ tensorflow::ImportNumpy();
%unignore xla::swig::LocalComputationBuilder::Tuple;
%unignore xla::swig::LocalComputationBuilder::GetTupleElement;
%unignore xla::swig::LocalComputationBuilder::ConvertElementType;
+%unignore xla::swig::LocalComputationBuilder::BitcastConvertType;
%unignore xla::swig::LocalComputationBuilder::Call;
%unignore xla::swig::LocalComputationBuilder::Transpose;
%unignore xla::swig::LocalComputationBuilder::Rev;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index ef043e4ca0..c0105b385b 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -721,6 +721,18 @@ class ComputationBuilder(object):
"""
return self._client.ConvertElementType(operand, new_element_type)
+ def BitcastConvertType(self, operand, new_element_type):
+ """Enqueues a bitcast type conversion operation onto the computation.
+
+ Args:
+ operand: the operand to convert.
+ new_element_type: the target primitive type.
+
+ Returns:
+ A LocalOp representing the added conversion op.
+ """
+ return self._client.BitcastConvertType(operand, new_element_type)
+
def GetShape(self, operand):
return _wrap_shape(self._client.GetShape(operand))
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 93177aa647..fd98e19457 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -489,6 +489,34 @@ class SingleOpTest(LocalComputationTest):
for src_dtype, dst_dtype in itertools.product(xla_types, xla_types):
_ConvertAndTest(x, src_dtype, dst_dtype)
+ def testBitcastConvertType(self):
+ xla_x32_types = {
+ np.int32: xla_client.xla_data_pb2.S32,
+ np.float32: xla_client.xla_data_pb2.F32,
+ }
+
+ xla_x64_types = {
+ np.int64: xla_client.xla_data_pb2.S64,
+ np.float64: xla_client.xla_data_pb2.F64,
+ }
+
+ def _ConvertAndTest(template, src_dtype, dst_dtype, dst_etype):
+ c = self._NewComputation()
+ x = c.Constant(np.array(template, dtype=src_dtype))
+ c.BitcastConvertType(x, dst_etype)
+
+ result = c.Build().Compile().Execute()
+ expected = np.array(template, src_dtype).view(dst_dtype)
+
+ self.assertEqual(result.shape, expected.shape)
+ self.assertEqual(result.dtype, expected.dtype)
+ np.testing.assert_equal(result, expected)
+
+ x = [0, 1, 0, 0, 1]
+ for xla_types in [xla_x32_types, xla_x64_types]:
+ for src_dtype, dst_dtype in itertools.product(xla_types, xla_types):
+ _ConvertAndTest(x, src_dtype, dst_dtype, xla_types[dst_dtype])
+
def testCrossReplicaSumOneReplica(self):
samples = [
NumpyArrayF32(42.0),
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 6397f1f479..a803520876 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include <array>
#include <utility>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
index 0b1cec1925..44b22a5586 100644
--- a/tensorflow/compiler/xla/rpc/BUILD
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -56,7 +56,7 @@ tf_cc_test(
":grpc_stub",
"//tensorflow:grpc++",
"//tensorflow/compiler/xla/client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/core:framework_internal",
"//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
index 90efee50b4..6788676181 100644
--- a/tensorflow/compiler/xla/rpc/grpc_client_test.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
#include "grpcpp/security/credentials.h"
#include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/rpc/grpc_stub.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/core/lib/io/path.h"
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index cba7883fde..528b7fdfd3 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -256,7 +256,7 @@ tf_cc_test(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:hlo_element_type_converter",
"//tensorflow/compiler/xla/tests:hlo_verified_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -564,7 +564,7 @@ cc_library(
":computation_placer",
":device_memory_allocator",
":platform_util",
- ":pool",
+ ":stream_pool",
":transfer_manager",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
@@ -598,6 +598,7 @@ cc_library(
":hlo_proto_util",
":platform_util",
":source_map_util",
+ ":stream_pool",
":transfer_manager",
"//tensorflow/compiler/xla:executable_run_options",
"//tensorflow/compiler/xla:execution_options_util",
@@ -642,7 +643,7 @@ cc_library(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:executable_build_options",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
],
@@ -751,8 +752,8 @@ cc_library(
":hlo_execution_profile",
":hlo_graph_dumper",
":hlo_proto",
- ":pool",
":shaped_buffer",
+ ":stream_pool",
"//tensorflow/compiler/xla:executable_run_options",
"//tensorflow/compiler/xla:status",
"//tensorflow/compiler/xla:status_macros",
@@ -838,7 +839,7 @@ cc_library(
hdrs = ["execution_tracker.h"],
deps = [
":backend",
- ":pool",
+ ":stream_pool",
"//tensorflow/compiler/xla:executable_run_options",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:util",
@@ -946,7 +947,6 @@ cc_library(
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
- "//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
],
@@ -1663,8 +1663,8 @@ tf_cc_test(
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:padding",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:lib",
@@ -2670,7 +2670,7 @@ tf_cc_test(
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:hlo_parser",
"//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
"//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -2707,7 +2707,7 @@ tf_cc_test(
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:lib",
@@ -2715,21 +2715,25 @@ tf_cc_test(
)
cc_library(
- name = "pool",
- hdrs = ["pool.h"],
+ name = "stream_pool",
+ srcs = ["stream_pool.cc"],
+ hdrs = ["stream_pool.h"],
deps = [
+ "//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/core:lib",
+ "//tensorflow/core:stream_executor_no_cuda",
],
)
tf_cc_test(
- name = "pool_test",
- srcs = ["pool_test.cc"],
+ name = "stream_pool_test",
+ srcs = ["stream_pool_test.cc"],
deps = [
- ":pool",
+ ":stream_pool",
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
+ "//tensorflow/core:stream_executor_no_cuda",
],
)
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 2205a7ec18..505c0e8dff 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1744,19 +1744,37 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
if (ReplaceInstructionIfSameShape(slice, slice->mutable_operand(0))) {
return Status::OK();
}
+
+ auto is_unstrided_slice = [](const HloInstruction* hlo) {
+ return c_all_of(hlo->slice_strides(),
+ [](int64 stride) { return stride == 1; });
+ };
+ if (slice->operand(0)->opcode() == HloOpcode::kSlice &&
+ is_unstrided_slice(slice) && is_unstrided_slice(slice->operand(0))) {
+ HloInstruction* operand_slice = slice->mutable_operand(0);
+ std::vector<int64> new_slice_starts = slice->slice_starts();
+ std::vector<int64> new_slice_limits = slice->slice_limits();
+ for (int64 i = 0; i < new_slice_starts.size(); ++i) {
+ new_slice_starts[i] += operand_slice->slice_starts(i);
+ new_slice_limits[i] += operand_slice->slice_starts(i);
+ }
+ return ReplaceWithNewInstruction(
+ slice, HloInstruction::CreateSlice(
+ slice->shape(), operand_slice->mutable_operand(0),
+ new_slice_starts, new_slice_limits, slice->slice_strides()));
+ }
return Status::OK();
}
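
// Worked instance of the fold above (illustrative): for a rank-1 input x,
//   slice(slice(x, starts={1}, limits={9}), starts={2}, limits={5})
// becomes slice(x, starts={3}, limits={6}); both the outer starts and limits
// are shifted by the inner slice's start of 1.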
Status AlgebraicSimplifierVisitor::HandleDynamicSlice(
HloInstruction* dynamic_slice) {
auto operand = dynamic_slice->mutable_operand(0);
- auto start_indices = dynamic_slice->operand(1);
if (ShapeUtil::IsScalar(dynamic_slice->shape())) {
return ReplaceInstruction(dynamic_slice, operand);
}
- // DynamicSlice where operand has the same size as the output and
- // start_indices are all zero is simply equal to operand.
- if (IsAll(start_indices, 0) && SameShape(operand, dynamic_slice)) {
+ // DynamicSlice where operand has the same size as the output is simply equal
+ // to operand.
+ if (SameShape(operand, dynamic_slice)) {
return ReplaceInstruction(dynamic_slice, operand);
}
return Status::OK();
@@ -1765,20 +1783,10 @@ Status AlgebraicSimplifierVisitor::HandleDynamicSlice(
Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice(
HloInstruction* dynamic_update_slice) {
auto update = dynamic_update_slice->mutable_operand(1);
- auto start_indices = dynamic_update_slice->operand(2);
- // DynamicUpdateSlice on a scalar just passes through the update argument.
- if (ShapeUtil::IsScalar(dynamic_update_slice->shape())) {
- return ReplaceInstruction(dynamic_update_slice, update);
- }
- // DynamicUpdateSlice where operand and update have the same size and
- // start_indices are all zero is simply equal to update.
- //
- // (We require start_indices to be all zero because we want this optimization
- // not to affect the visible behavior of this op even when the indices are out
- // of range. Currently dynamic-update-slice wraps out-of-range indices, so
- // we can only remove the op if its indices never wrap.)
- if (IsAll(start_indices, 0) && SameShape(dynamic_update_slice, update)) {
+ // DynamicUpdateSlice where operand and update have the same size is simply
+ // equal to update.
+ if (SameShape(dynamic_update_slice, update)) {
return ReplaceInstruction(dynamic_update_slice, update);
}
@@ -1904,6 +1912,26 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
new_reduce_dimensions, function));
}
}
+ // Convert Reduce(concat({a,b,c,...})) to
+ // map(map(reduce(a), reduce(b)), reduce(c)), folding pairwise from the left.
+ //
+ // This should make fusion easier or use less memory bandwidth in the unfused
+ // case.
+ if (arg->opcode() == HloOpcode::kConcatenate &&
+ c_linear_search(reduce->dimensions(), arg->concatenate_dimension())) {
+ HloInstruction* old_reduce = nullptr;
+ for (HloInstruction* operand : arg->operands()) {
+ HloInstruction* new_reduce = computation_->AddInstruction(
+ HloInstruction::CreateReduce(reduce->shape(), operand, init_value,
+ reduce->dimensions(), function));
+ if (old_reduce != nullptr) {
+ new_reduce = computation_->AddInstruction(HloInstruction::CreateMap(
+ reduce->shape(), {old_reduce, new_reduce}, function));
+ }
+ old_reduce = new_reduce;
+ }
+ return ReplaceInstruction(reduce, old_reduce);
+ }
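
// Illustrative instance of the rewrite, matching the SimplifyReduceOfConcat
// test expectation below: with operands a, b, c concatenated on a reduced
// dimension, reduce(concat({a, b, c})) becomes
//   map(map(reduce(a), reduce(b)), reduce(c)),
// i.e., partial reductions combined pairwise by the same scalar computation.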
return Status::OK();
}
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 3f0f2afadd..8b81b4c97e 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -1250,6 +1250,55 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
op::Concatenate(param0, param0, param1));
}
+// Test that reduce of concat is simplified.
+TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
+ const int kParamLength = 100;
+ Shape r3f32 =
+ ShapeUtil::MakeShape(F32, {kParamLength, kParamLength, kParamLength});
+ HloComputation::Builder builder(TestName());
+ HloInstruction* param0 = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, r3f32, "param0"));
+ HloInstruction* param1 = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, r3f32, "param1"));
+ HloInstruction* param2 = builder.AddInstruction(
+ HloInstruction::CreateParameter(2, r3f32, "param2"));
+ Shape concat_shape =
+ ShapeUtil::MakeShape(F32, {kParamLength, 3 * kParamLength, kParamLength});
+ HloInstruction* Concatenate =
+ builder.AddInstruction(HloInstruction::CreateConcatenate(
+ concat_shape, {param0, param1, param2}, 1));
+ HloComputation* add_computation = nullptr;
+ {
+ HloComputation::Builder builder(TestName() + ".add");
+ const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+ HloInstruction* p0 = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+ HloInstruction* p1 = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+ builder.AddInstruction(
+ HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+ add_computation = module().AddEmbeddedComputation(builder.Build());
+ }
+ Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
+ Shape reduce_shape = ShapeUtil::MakeShape(F32, {kParamLength});
+
+ HloInstruction* zero = builder.AddInstruction(
+ HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
+ builder.AddInstruction(HloInstruction::CreateReduce(
+ reduce_shape, Concatenate, zero, {1, 2}, add_computation));
+
+ auto computation = module().AddEntryComputation(builder.Build());
+
+ AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+ non_bitcasting_callback());
+ ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+
+ EXPECT_THAT(
+ computation->root_instruction(),
+ op::Map(op::Map(op::Reduce(param0, zero), op::Reduce(param1, zero)),
+ op::Reduce(param2, zero)));
+}
+
// Test that a concatenate with only empty operands is removed.
TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
const int kParamLength = 100;
@@ -1859,6 +1908,39 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
EXPECT_THAT(computation->root_instruction(), param);
}
+TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) {
+ HloComputation::Builder builder(TestName());
+ const int64 dim0 = 11;
+ const int64 dim1 = 12;
+ HloInstruction* param =
+ builder.AddInstruction(HloInstruction::CreateParameter(
+ 0, ShapeUtil::MakeShape(F32, {dim0, dim1}), "param"));
+ HloInstruction* original_slice =
+ builder.AddInstruction(HloInstruction::CreateSlice(
+ ShapeUtil::MakeShape(F32, {dim0 - 2, dim1 - 4}), param,
+ /*start_indices=*/{1, 2},
+ /*limit_indices=*/{dim0 - 1, dim1 - 2}, /*strides=*/{1, 1}));
+
+ builder.AddInstruction(HloInstruction::CreateSlice(
+ ShapeUtil::MakeShape(F32, {dim0 - 5, dim1 - 9}), original_slice,
+ /*start_indices=*/{2, 3},
+ /*limit_indices=*/{dim0 - 3, dim1 - 6}, /*strides=*/{1, 1}));
+ auto module = CreateNewModule();
+ HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+ EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param)));
+
+ AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+ non_bitcasting_callback());
+ ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+
+ EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+ EXPECT_EQ(computation->root_instruction()->slice_starts(0), 3);
+ EXPECT_EQ(computation->root_instruction()->slice_starts(1), 5);
+ EXPECT_EQ(computation->root_instruction()->slice_limits(0), dim0 - 2);
+ EXPECT_EQ(computation->root_instruction()->slice_limits(1), dim1 - 4);
+}
+
TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
struct ConvTestOptions {
int in_batch = 10;
@@ -2407,8 +2489,8 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
shape,
builder.AddInstruction(
HloInstruction::CreateParameter(0, shape, "slice_from")),
- builder.AddInstruction(HloInstruction::CreateConstant(
- LiteralUtil::CreateR1<int>({0, 0, 0}))),
+ builder.AddInstruction(HloInstruction::CreateParameter(
+ 1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
/*slice_sizes=*/{10, 100, 1000}));
auto computation = module().AddEntryComputation(builder.Build());
@@ -2441,8 +2523,8 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
builder.AddInstruction(
HloInstruction::CreateParameter(2, slice_shape, "to_update")),
slice,
- builder.AddInstruction(HloInstruction::CreateConstant(
- LiteralUtil::CreateR1<int>({0, 0, 0})))));
+ builder.AddInstruction(HloInstruction::CreateParameter(
+ 3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
auto computation = module().AddEntryComputation(builder.Build());
AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 349b32451a..d12be3e007 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -96,24 +96,19 @@ Backend::CreateDefaultBackend() {
return CreateBackend(backend_options);
}
-StatusOr<Backend::StreamPtr> Backend::BorrowStream(int device_ordinal) {
- TF_ASSIGN_OR_RETURN(auto exec, stream_executor(device_ordinal));
- return BorrowStream(exec);
+StatusOr<StreamPool::Ptr> Backend::BorrowStream(int device_ordinal) {
+ TF_ASSIGN_OR_RETURN(auto executor, stream_executor(device_ordinal));
+ return BorrowStream(executor);
}
-StatusOr<Backend::StreamPtr> Backend::BorrowStream(
- se::StreamExecutor* executor) {
+StatusOr<StreamPool::Ptr> Backend::BorrowStream(se::StreamExecutor* executor) {
tensorflow::mutex_lock l(mu_);
if (0 == stream_pools_.count(executor)) {
stream_pools_.emplace(std::piecewise_construct,
std::forward_as_tuple(executor),
- std::forward_as_tuple([executor]() {
- auto stream = MakeUnique<se::Stream>(executor);
- stream->Init();
- return stream;
- }));
+ std::forward_as_tuple());
}
- return stream_pools_.at(executor).Allocate();
+ return stream_pools_.at(executor).BorrowStream(executor);
}
Backend::Backend(
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index 6546602473..1bc3796fa4 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -24,7 +24,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/computation_placer.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
@@ -63,11 +63,9 @@ class BackendOptions {
//
// It also offers a pooling API for creation/use of initialized streams:
//
-// StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie();
+// StreamPool::Ptr stream = backend->BorrowStream().ConsumeValueOrDie();
class Backend {
public:
- using StreamPtr = Pool<se::Stream>::SmartPtr;
-
// Creates a new backend.
static StatusOr<std::unique_ptr<Backend>> CreateBackend(
const BackendOptions& options);
@@ -114,13 +112,13 @@ class Backend {
// Borrows a stream for use by the caller, either by grabbing it from an
// internal pool, or by constructing/initializing it, and returns the result
// to the caller.
- StatusOr<StreamPtr> BorrowStream(int device_ordinal);
- StatusOr<StreamPtr> BorrowStream(se::StreamExecutor* executor);
+ StatusOr<StreamPool::Ptr> BorrowStream(int device_ordinal);
+ StatusOr<StreamPool::Ptr> BorrowStream(se::StreamExecutor* executor);
// Returns a function to borrow a stream, as `BorrowStream` above does.
// Purely a convenience; the caller could equally well construct this
// anonymous function itself.
- std::function<StatusOr<StreamPtr>(int)> StreamBorrower() {
+ std::function<StatusOr<StreamPool::Ptr>(int)> StreamBorrower() {
return [this](int device_ordinal) { return BorrowStream(device_ordinal); };
}
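
A usage sketch of the pooled API, relying on StreamPool::Ptr returning the
stream to the executor's pool when it is destroyed (as the pooling comment
above implies); `backend` is an assumed Backend*:

  StreamPool::Ptr stream =
      backend->BorrowStream(/*device_ordinal=*/0).ConsumeValueOrDie();
  // ... enqueue work on stream.get() ...
  // `stream` goes out of scope here and the se::Stream returns to the pool.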
@@ -169,7 +167,7 @@ class Backend {
tensorflow::mutex mu_;
// Mapping from stream executor to stream pools, used by `BorrowStream` above.
- std::map<se::StreamExecutor*, Pool<se::Stream>> stream_pools_ GUARDED_BY(mu_);
+ std::map<se::StreamExecutor*, StreamPool> stream_pools_ GUARDED_BY(mu_);
// The default memory allocator to use.
std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index b21c83a07f..2fb401c428 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -215,7 +215,12 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo,
if (ContainsKey(values_that_must_be_kept_as_f32_, value)) {
return false;
}
- if (ValueTypeAfterChange(value) == BF16) {
+ // We use the original type for the value because we are going to examine
+ // the uses of it, instead of the value itself. If ValueTypeAfterChange()
+ // were used, it would cause problems when there are aliasing buffers, i.e.,
+ // ResolveInconsistencyOfAliasingBuffers() would fail to revert the
+ // tentative change to BF16 even if the uses require F32.
+ if (value->shape().element_type() == BF16) {
continue;
}
for (const HloUse& use : value->uses()) {
@@ -566,6 +571,9 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper(
}
visited_computations->insert(visited_in_while.begin(),
visited_in_while.end());
+ } else if (hlo->opcode() == HloOpcode::kFusion) {
+ ResolveInconsistencyOfAliasingBuffersHelper(
+ hlo->fused_instructions_computation(), visited_computations);
}
}
// Now adjust parameters of called computations.
@@ -769,8 +777,7 @@ StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
// propagation in reverse topological order.
for (auto comp_it = computations_topological_order.rbegin();
comp_it != computations_topological_order.rend(); ++comp_it) {
- if ((*comp_it)->IsFusionComputation()) {
- // Fusion computations are handled when visiting the fusion instruction.
+ if (ContainsKey(computations_visited_in_backward_pass_, *comp_it)) {
continue;
}
auto insts = (*comp_it)->MakeInstructionPostOrder();
@@ -778,6 +785,7 @@ StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
DetermineInstructionPrecision(*inst_it,
/*skip_parameters=*/true);
}
+ computations_visited_in_backward_pass_.insert(*comp_it);
}
// It's possible that an instruction does not define a buffer, but the
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index aeafb25ad7..69b654d30e 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -508,6 +508,63 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
EXPECT_FALSE(OutputsBF16(dot));
}
+// Tests that if the while condition prevents using BF16, no changes should be
+// made to the while body, and thus none to the fusion node inside it.
+TEST_F(BFloat16PropagationTest,
+ ConditionPreventsPropagationForFusionInsideWhile) {
+ auto module = CreateNewModule();
+ auto builder = HloComputation::Builder(TestName());
+ Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+
+ HloInstruction* param0 = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, shape, "param0"));
+ HloInstruction* param1 = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, shape, "param1"));
+ HloInstruction* add = builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+
+ auto builder_cond = HloComputation::Builder("cond");
+ auto cond_param = builder_cond.AddInstruction(
+ HloInstruction::CreateParameter(0, shape, "cond_param"));
+ builder_cond.AddInstruction(HloInstruction::CreateBinary(
+ ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+ builder_cond.AddInstruction(HloInstruction::CreateSlice(
+ ShapeUtil::MakeShape(F32, {}), cond_param, {0, 0}, {1, 1}, {1, 1})),
+ builder_cond.AddInstruction(HloInstruction::CreateSlice(
+ ShapeUtil::MakeShape(F32, {}), cond_param, {1, 1}, {2, 2}, {1, 1}))));
+ auto cond = module->AddEmbeddedComputation(builder_cond.Build());
+
+ auto builder_body = HloComputation::Builder("body");
+ auto body_param = builder_body.AddInstruction(
+ HloInstruction::CreateParameter(0, shape, "body_param"));
+ auto body_transpose = builder_body.AddInstruction(
+ HloInstruction::CreateTranspose(shape, body_param, {0, 1}));
+
+ auto builder_f = HloComputation::Builder("fusion");
+ HloInstruction* a_f =
+ builder_f.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+ builder_f.AddInstruction(HloInstruction::CreateTranspose(shape, a_f, {0, 1}));
+ auto comp_f = module->AddEmbeddedComputation(builder_f.Build());
+ auto body_fusion = builder_body.AddInstruction(HloInstruction::CreateFusion(
+ shape, HloInstruction::FusionKind::kCustom, {body_transpose}, comp_f));
+ auto body = module->AddEmbeddedComputation(builder_body.Build());
+
+ auto while_hlo = builder.AddInstruction(
+ HloInstruction::CreateWhile(shape, cond, body, add));
+
+ auto dot = builder.AddInstruction(HloInstruction::CreateBinary(
+ shape, HloOpcode::kDot, while_hlo, while_hlo));
+ auto computation = module->AddEntryComputation(builder.Build());
+
+ EXPECT_FALSE(PropagatePrecision(module.get()));
+ EXPECT_EQ(computation->root_instruction(), dot);
+ EXPECT_FALSE(OutputsBF16(add));
+ EXPECT_FALSE(OutputsBF16(body_fusion));
+ EXPECT_FALSE(OutputsBF16(body_param));
+ EXPECT_FALSE(OutputsBF16(body_transpose));
+ EXPECT_FALSE(OutputsBF16(a_f));
+}
+
// Tests that BF16 is propagated properly through while computations with
// tuple-shaped input/output.
TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
@@ -553,10 +610,14 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
HloInstruction::CreateGetTupleElement(shape, body_param, 0));
auto body_rhs = builder_body.AddInstruction(
HloInstruction::CreateGetTupleElement(shape, body_param, 1));
- auto body_dot = builder_body.AddInstruction(
+ auto body_dot1 = builder_body.AddInstruction(
HloInstruction::CreateBinary(shape, HloOpcode::kDot, body_lhs, body_rhs));
+ auto body_dot2 = builder_body.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kDot, body_rhs, body_lhs));
+ auto body_transpose = builder_body.AddInstruction(
+ HloInstruction::CreateTranspose(shape, body_dot2, {0, 1}));
builder_body.AddInstruction(
- HloInstruction::CreateTuple({body_dot, body_rhs}));
+ HloInstruction::CreateTuple({body_dot1, body_transpose}));
auto body = module->AddEmbeddedComputation(builder_body.Build());
auto while_hlo = builder.AddInstruction(
@@ -575,9 +636,11 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
EXPECT_EQ(computation->root_instruction(), dot);
EXPECT_TRUE(OutputsBF16(lhs));
EXPECT_FALSE(OutputsBF16(rhs));
- EXPECT_TRUE(OutputsBF16(body_dot));
+ EXPECT_TRUE(OutputsBF16(body_dot1));
EXPECT_TRUE(OutputsBF16(body_lhs));
EXPECT_FALSE(OutputsBF16(body_rhs));
+ EXPECT_FALSE(OutputsBF16(body_dot2));
+ EXPECT_FALSE(OutputsBF16(body_transpose));
EXPECT_TRUE(OutputsBF16(cond_lhs));
EXPECT_FALSE(OutputsBF16(cond_rhs));
EXPECT_TRUE(OutputsBF16(add0));
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 783e3f7e73..e4d2e73b99 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -270,7 +270,7 @@ BufferAllocationProto BufferAllocation::ToProto() const {
proto.set_index(index_);
proto.set_size(size_);
proto.set_is_thread_local(is_thread_local_);
- proto.set_is_reusable(is_reusable_);
+ proto.set_is_tuple(is_tuple_);
proto.set_color(color_.value());
if (is_entry_computation_parameter_) {
proto.set_is_entry_computation_parameter(true);
@@ -279,6 +279,7 @@ BufferAllocationProto BufferAllocation::ToProto() const {
}
proto.set_parameter_number(parameter_number_);
}
+ proto.set_is_constant(is_constant_);
proto.set_maybe_live_out(maybe_live_out_);
for (const auto& buffer_offset_size : assigned_buffers_) {
BufferAllocationProto::Assigned* proto_assigned = proto.add_assigned();
@@ -304,6 +305,9 @@ string BufferAllocation::ToString() const {
StrAppend(&output, ", parameter ", parameter_number(), " at ShapeIndex ",
param_shape_index().ToString());
}
+ if (is_constant()) {
+ StrAppend(&output, ", constant");
+ }
if (is_thread_local()) {
StrAppend(&output, ", thread-local");
}
@@ -491,20 +495,16 @@ BufferAssignment::GetUniqueTopLevelOutputSlice() const {
}
BufferAllocation* BufferAssignment::NewEmptyAllocation(
- int64 size, bool is_thread_local, bool is_reusable,
- LogicalBuffer::Color color) {
+ int64 size, LogicalBuffer::Color color) {
BufferAllocation::Index index = allocations_.size();
- allocations_.emplace_back(index, size, is_thread_local, is_reusable, color);
+ allocations_.emplace_back(index, size, color);
BufferAllocation* allocation = &allocations_.back();
return allocation;
}
BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer,
- int64 size,
- bool is_thread_local,
- bool is_reusable) {
- BufferAllocation* allocation =
- NewEmptyAllocation(size, is_thread_local, is_reusable, buffer.color());
+ int64 size) {
+ BufferAllocation* allocation = NewEmptyAllocation(size, buffer.color());
AddAssignment(allocation, buffer, /*offset=*/0, size);
allocation->peak_buffers_.push_back(&buffer);
return allocation;
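The thread-local/reusable constructor flags removed here move to setters; a
sketch of the resulting call-site pattern, mirroring the changes to
AssignBuffersForComputation further down in this file:

    BufferAllocation* allocation =
        assignment->NewAllocation(*buffer, buffer_size);
    allocation->set_is_thread_local(true);  // formerly an is_thread_local flag
    // is_reusable() is now derived: !is_thread_local() && !is_tuple()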
@@ -517,7 +517,8 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation,
CHECK_EQ(0, allocation_index_for_buffer_.count(&buffer))
<< "LogicalBuffer " << buffer << " already has an allocation.";
CHECK(allocation->is_reusable() || allocation->assigned_buffers().empty())
- << "Non-reusable allocation already assigned a buffer";
+ << "Non-reusable allocation already assigned a buffer: "
+ << allocation->ToString();
TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer));
@@ -609,6 +610,10 @@ Status BufferAssignment::ComputeSummaryStats() {
stats_.parameter_allocation_count++;
stats_.parameter_allocation_bytes += allocation.size();
}
+ if (allocation.is_constant()) {
+ stats_.constant_allocation_count++;
+ stats_.constant_allocation_bytes += allocation.size();
+ }
if (allocation.maybe_live_out()) {
stats_.maybe_live_out_allocation_count++;
stats_.maybe_live_out_allocation_bytes += allocation.size();
@@ -645,6 +650,8 @@ string BufferAssignment::Stats::ToString() const {
Appendf(&s, "BufferAssignment stats:\n");
Appendf(&s, " parameter allocation: %10s\n",
HumanReadableNumBytes(parameter_allocation_bytes).c_str());
+ Appendf(&s, " constant allocation: %10s\n",
+ HumanReadableNumBytes(constant_allocation_bytes).c_str());
Appendf(&s, " maybe_live_out allocation: %10s\n",
HumanReadableNumBytes(maybe_live_out_allocation_bytes).c_str());
Appendf(&s, " preallocated temp allocation: %10s\n",
@@ -722,8 +729,10 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
LogicalBuffer::SizeFunction buffer_size,
LogicalBuffer::AlignmentFunction color_alignment,
- bool allow_input_output_aliasing, BufferLiveness::Colorer colorer) {
- BufferAssigner assigner(allow_input_output_aliasing, std::move(colorer));
+ bool allow_input_output_aliasing, bool allocate_buffers_for_constants,
+ BufferLiveness::Colorer colorer) {
+ BufferAssigner assigner(allow_input_output_aliasing,
+ allocate_buffers_for_constants, std::move(colorer));
return assigner.CreateAssignment(module, std::move(hlo_ordering),
std::move(buffer_size),
std::move(color_alignment));
@@ -751,8 +760,8 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
return false;
}
- if (allocation->is_entry_computation_parameter()) {
- VLOG(4) << "Can't assign: allocation holds parameter";
+ if (allocation->is_readonly()) {
+ VLOG(4) << "Can't assign: allocation is readonly";
return false;
}
@@ -808,8 +817,7 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
}
Status BufferAssigner::AssignBuffersForComputation(
- const HloComputation* computation, const DebugOptions& debug_options,
- bool is_thread_local,
+ const HloComputation* computation, bool is_thread_local,
const FlatSet<const LogicalBuffer*>& colocated_buffers,
const FlatSet<BufferAllocation::Index>& colocated_allocations,
FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>*
@@ -905,15 +913,19 @@ Status BufferAssigner::AssignBuffersForComputation(
TF_RET_CHECK(!assignment->HasAllocation(*buffer));
const HloInstruction* instruction = buffer->instruction();
+ const int64 buffer_size = assignment->buffer_size_(*buffer);
+
if (instruction->opcode() == HloOpcode::kConstant) {
- // No BufferAllocations for constants.
- // TODO(b/32248867): For consistency, constants should get allocations.
- VLOG(3) << "Skipping constant: " << *buffer;
+ if (allocate_buffers_for_constants_) {
+ BufferAllocation* allocation =
+ assignment->NewAllocation(*buffer, buffer_size);
+ allocation->set_constant(true);
+ VLOG(3) << "New allocation #" << allocation->index() << " for constant "
+ << *buffer;
+ }
continue;
}
- const int64 buffer_size = assignment->buffer_size_(*buffer);
-
const bool is_entry_parameter =
instruction->opcode() == HloOpcode::kParameter &&
computation == computation->parent()->entry_computation();
@@ -923,9 +935,7 @@ Status BufferAssigner::AssignBuffersForComputation(
// computations do not need special allocations because they live inside
// callers.
BufferAllocation* allocation =
- assignment->NewAllocation(*buffer, buffer_size,
- /*is_thread_local=*/false,
- /*is_reusable=*/false);
+ assignment->NewAllocation(*buffer, buffer_size);
allocation->set_entry_computation_parameter(
instruction->parameter_number(), buffer->index());
VLOG(3) << "New allocation #" << allocation->index()
@@ -934,20 +944,18 @@ Status BufferAssigner::AssignBuffersForComputation(
}
if (is_thread_local) {
- // We do not reuse thread-local buffers for now, because they are
- // dynamically allocated and their lifetimes are hard to compute.
- BufferAllocation* allocation = assignment->NewAllocation(
- *buffer, buffer_size, is_thread_local, /*is_reusable=*/false);
+ BufferAllocation* allocation =
+ assignment->NewAllocation(*buffer, buffer_size);
+ allocation->set_is_thread_local(true);
VLOG(3) << "New allocation #" << allocation->index()
<< " for thread-local: " << *buffer;
continue;
}
if (ShapeUtil::IsTuple(buffer->shape())) {
- // TODO(b/34669761): Don't reuse tuple buffers because the GPU backend
- // assumes longer buffer liveness than indicated by the analysis.
- BufferAllocation* allocation = assignment->NewAllocation(
- *buffer, buffer_size, is_thread_local, /*is_reusable=*/false);
+ BufferAllocation* allocation =
+ assignment->NewAllocation(*buffer, buffer_size);
+ allocation->set_is_tuple(true);
VLOG(3) << "New allocation #" << allocation->index()
<< " for tuple-shaped buffer: " << *buffer;
continue;
@@ -1030,8 +1038,8 @@ Status BufferAssigner::AssignBuffersForComputation(
}
if (!assignment->HasAllocation(*buffer)) {
- BufferAllocation* allocation = assignment->NewAllocation(
- *buffer, buffer_size, is_thread_local, /*is_reusable=*/true);
+ BufferAllocation* allocation =
+ assignment->NewAllocation(*buffer, buffer_size);
allocation_indices.push_back(allocation->index());
VLOG(3) << "New allocation #" << allocation->index()
<< " for: " << *buffer;
@@ -1085,6 +1093,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
VLOG(2) << "Simulating heap for color " << color;
int64 alignment = assignment->color_alignment_(color);
HeapSimulator::Options options;
+ options.alloc_constants = allocate_buffers_for_constants_;
BufferValueFlatSet buffer_value_set =
ToBufferValueFlatSet(single_colored_set.second);
options.buffers_to_assign = &buffer_value_set;
@@ -1227,8 +1236,8 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
result.fragmentation_size;
}
- BufferAllocation* allocation = assignment->NewEmptyAllocation(
- result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true, color);
+ BufferAllocation* allocation =
+ assignment->NewEmptyAllocation(result.heap_size, color);
for (const auto& buffer_chunk : result.chunk_map) {
// TODO(lauj) Remove this down_cast after downstream users of
// BufferAllocation::assigned_buffers() are updated to use BufferValue.
@@ -1332,11 +1341,25 @@ BufferAssigner::MergeColocatedBufferSets(
auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness,
&buffer_size,
&is_entry_parameter](int64 i, int64 j) {
- // Do not merge if one of the sets includes live outs or entry parameters.
+ // Do not merge if one of the sets includes live outs, entry parameters, or
+ // constants.
+ //
+ // Buffer liveness does not report the correct live range for entry
+ // parameter and live-out buffers, so we have to special-case them here. On
+ // backends that support constant buffer allocations, constant buffers are
+ // assigned globals in readonly storage, so we can't merge colocated buffer
+ // sets containing constants with colocated buffer sets containing writing
+ // instructions or other constants.
+ //
+ // Moreover, on the CPU and GPU backends the entry parameter buffers belong
+ // to the caller of the executable, so we can't write to entry parameters
+ // either; the argument for not merging constants applies to entry
+ // parameters as well.
for (int64 key : {i, j}) {
for (auto& buffer : colocated_buffer_sets[key]) {
if (buffer_liveness.MaybeLiveOut(*buffer) ||
- is_entry_parameter(*buffer)) {
+ is_entry_parameter(*buffer) ||
+ buffer->instruction()->opcode() == HloOpcode::kConstant) {
return true;
}
}
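A condensed sketch of the rule described in the comment above (IsLiveOut and
IsEntryParameter are hypothetical stand-ins for the captured helpers; not part
of the diff): merging is rejected as soon as either set contains a buffer that
pins its allocation.

    bool CannotMerge(const BufferSet& a, const BufferSet& b) {
      for (const auto* set : {&a, &b}) {
        for (const LogicalBuffer* buffer : *set) {
          // Live-out, entry-parameter, and constant buffers each rule out
          // merging, for the reasons given above.
          if (IsLiveOut(*buffer) || IsEntryParameter(*buffer) ||
              buffer->instruction()->opcode() == HloOpcode::kConstant) {
            return true;
          }
        }
      }
      return false;
    }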
@@ -1566,6 +1589,7 @@ void BufferAssigner::AssignColocatedBufferSets(
// param in 'colocated_buffer_set'.
int64 entry_parameter_number = -1;
const ShapeIndex* entry_parameter_shape_idx = nullptr;
+ bool is_constant = false;
for (const LogicalBuffer* buffer : colocated_buffer_set) {
const HloInstruction* instruction = buffer->instruction();
const HloComputation* computation = instruction->parent();
@@ -1573,10 +1597,14 @@ void BufferAssigner::AssignColocatedBufferSets(
computation == computation->parent()->entry_computation()) {
entry_parameter_number = instruction->parameter_number();
entry_parameter_shape_idx = &buffer->index();
- break;
+ } else if (instruction->opcode() == HloOpcode::kConstant) {
+ is_constant = true;
}
}
+ CHECK(!is_constant || entry_parameter_number == -1)
+ << "Copy insertion should have inserted copies to prevent this.";
+
for (const LogicalBuffer* buffer : colocated_buffer_set) {
const int64 buffer_size = assignment->buffer_size_(*buffer);
if (allocation == nullptr) {
@@ -1584,18 +1612,14 @@ void BufferAssigner::AssignColocatedBufferSets(
// allocations for each colocated buffer set. When liveness has
// module-level scope, we can allow buffers to be shared across
// computations (in some cases).
- allocation = assignment->NewAllocation(*buffer, buffer_size,
- /*is_thread_local=*/false,
- /*is_reusable=*/true);
+ allocation = assignment->NewAllocation(*buffer, buffer_size);
if (entry_parameter_number >= 0) {
- // This colocated buffer set contains an entry parameter and other
- // logical buffers which use the parameter as read-only in a while
- // body computation (which updates in place).
- // Set 'entry_computation_parameter' to indicate that it contains
- // an entry parameter, and to prevent reuse in MaybeAssignBuffer.
allocation->set_entry_computation_parameter(
entry_parameter_number, *entry_parameter_shape_idx);
}
+ if (is_constant) {
+ allocation->set_constant(true);
+ }
colocated_allocations->insert(allocation->index());
} else {
CHECK_EQ(buffer_size, allocation->size())
@@ -1653,7 +1677,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
buffers_to_assign_sequentially;
for (auto* computation : global_computations) {
TF_RETURN_IF_ERROR(AssignBuffersForComputation(
- computation, module->config().debug_options(),
+ computation,
/*is_thread_local=*/false, colocated_buffers, colocated_allocations,
&buffers_to_assign_sequentially, assignment.get()));
}
@@ -1674,7 +1698,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
continue;
}
TF_RETURN_IF_ERROR(AssignBuffersForComputation(
- computation, module->config().debug_options(),
+ computation,
/*is_thread_local=*/true, colocated_buffers, colocated_allocations,
/*buffers_to_assign_sequentially=*/nullptr, assignment.get()));
}
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index ad0b0bf7c2..94495290c1 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -32,7 +32,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/gtl/flatmap.h"
#include "tensorflow/core/lib/gtl/flatset.h"
@@ -58,13 +57,8 @@ class BufferAllocation {
// contiguously and can be used as array indexes.
using Index = int64;
- BufferAllocation(Index index, int64 size, bool is_thread_local,
- bool is_reusable, LogicalBuffer::Color color)
- : index_(index),
- size_(size),
- is_thread_local_(is_thread_local),
- is_reusable_(is_reusable),
- color_(color) {}
+ BufferAllocation(Index index, int64 size, LogicalBuffer::Color color)
+ : index_(index), size_(size), color_(color) {}
~BufferAllocation() {}
// Returns the index of this allocation.
@@ -74,9 +68,28 @@ class BufferAllocation {
// inside of a map or reduce computation. Such allocations need to be
// thread-local.
bool is_thread_local() const { return is_thread_local_; }
+ void set_is_thread_local(bool is_thread_local) {
+ is_thread_local_ = is_thread_local;
+ }
// Whether this allocation can be used by more than one logical buffer.
- bool is_reusable() const { return is_reusable_; }
+ bool is_reusable() const {
+ // We do not reuse thread-local buffers for now, because they are
+ // dynamically allocated and their lifetimes are hard to compute.
+ //
+ // TODO(b/34669761): Don't reuse tuple buffers because the GPU backend
+ // assumes longer buffer liveness than indicated by the analysis.
+ return !is_thread_local() && !is_tuple();
+ }
+
+ // Whether this allocation is readonly, i.e., backed by memory we cannot
+ // write to.
+ bool is_readonly() const {
+ return is_entry_computation_parameter() || is_constant();
+ }
+
+ bool is_tuple() const { return is_tuple_; }
+ void set_is_tuple(bool is_tuple) { is_tuple_ = is_tuple; }
// Whether this allocation holds a LogicalBuffer from a parameter of the entry
// computation. These buffers have lifetimes which may be longer than the
@@ -84,6 +97,13 @@ class BufferAllocation {
bool is_entry_computation_parameter() const {
return is_entry_computation_parameter_;
}
+
+ // Whether this allocation holds a constant. On the CPU and GPU backends
+ // constant allocations are not allocated dynamically; instead, references
+ // to these buffer allocations are resolved to a global in the readonly
+ // section of the binary.
+ bool is_constant() const { return is_constant_; }
+
// If this allocation holds a Buffer from a parameter of the entry
// computation, this method returns the parameter number. CHECKs otherwise.
int64 parameter_number() const {
@@ -189,7 +209,9 @@ class BufferAllocation {
// of the computation.
!maybe_live_out() &&
// Thread-local buffers are allocated using `alloca`s.
- !is_thread_local();
+ !is_thread_local() &&
+ // Constant buffers are allocated as global values.
+ !is_constant();
}
// Add a heap trace which was used to assign slices to logical buffers in this
@@ -245,6 +267,8 @@ class BufferAllocation {
parameter_number_ = parameter_number;
param_shape_index_ = std::move(param_shape_index);
}
+
+ void set_constant(bool is_constant) { is_constant_ = is_constant; }
void set_maybe_live_out(bool value) { maybe_live_out_ = value; }
void set_index(Index index) { index_ = index; }
void set_size(int64 size) { size_ = size; }
@@ -256,10 +280,10 @@ class BufferAllocation {
int64 size_;
// Whether this buffer needs to be thread-local.
- bool is_thread_local_;
+ bool is_thread_local_ = false;
- // Whether this buffer is usable by more than one logical buffer.
- bool is_reusable_;
+ // Whether this buffer holds a tuple.
+ bool is_tuple_ = false;
// Color of the allocation.
LogicalBuffer::Color color_;
@@ -283,6 +307,9 @@ class BufferAllocation {
// might not actually escape.
bool maybe_live_out_ = false;
+ // See comment on the is_constant() accessor.
+ bool is_constant_ = false;
+
// Mapping from the set of buffers assigned to this allocation to their
// logical offsets and sizes.
tensorflow::gtl::FlatMap<const LogicalBuffer*, OffsetSize> assigned_buffers_;
@@ -398,6 +425,8 @@ class BufferAssignment {
struct Stats {
int64 parameter_allocation_count = 0;
int64 parameter_allocation_bytes = 0;
+ int64 constant_allocation_count = 0;
+ int64 constant_allocation_bytes = 0;
int64 maybe_live_out_allocation_count = 0;
int64 maybe_live_out_allocation_bytes = 0;
int64 preallocated_temp_allocation_count = 0;
@@ -426,14 +455,11 @@ class BufferAssignment {
// Creates and returns a new BufferAllocation, with no assigned
// LogicalBuffers. Ownership is maintained internally.
- BufferAllocation* NewEmptyAllocation(int64 size, bool is_thread_local,
- bool is_reusable,
- LogicalBuffer::Color color);
+ BufferAllocation* NewEmptyAllocation(int64 size, LogicalBuffer::Color color);
// Helper that calls NewEmptyAllocation and AddAssignment in one call,
// creating an allocation containing a single LogicalBuffer.
- BufferAllocation* NewAllocation(const LogicalBuffer& buffer, int64 size,
- bool is_thread_local, bool is_reusable);
+ BufferAllocation* NewAllocation(const LogicalBuffer& buffer, int64 size);
// Adds a LogicalBuffer to the set assigned to the given allocation.
void AddAssignment(BufferAllocation* allocation, const LogicalBuffer& buffer,
@@ -493,12 +519,15 @@ class BufferAssigner {
LogicalBuffer::SizeFunction buffer_size,
LogicalBuffer::AlignmentFunction color_alignment,
bool allow_input_output_aliasing = false,
+ bool allocate_buffers_for_constants = false,
BufferLiveness::Colorer colorer = BufferLiveness::DefaultColorer());
private:
BufferAssigner(bool allow_input_output_aliasing,
+ bool allocate_buffers_for_constants,
BufferLiveness::Colorer colorer)
: allow_input_output_aliasing_(allow_input_output_aliasing),
+ allocate_buffers_for_constants_(allocate_buffers_for_constants),
colorer_(colorer) {}
virtual ~BufferAssigner() = default;
@@ -513,8 +542,7 @@ class BufferAssigner {
// true, then all assigned buffers have the is_thread_local flag set to
// true.
Status AssignBuffersForComputation(
- const HloComputation* computation, const DebugOptions& debug_options,
- bool is_thread_local,
+ const HloComputation* computation, bool is_thread_local,
const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers,
const tensorflow::gtl::FlatSet<BufferAllocation::Index>&
colocated_allocations,
@@ -595,6 +623,9 @@ class BufferAssigner {
// buffers can be shared if their sizes match.
bool allow_input_output_aliasing_;
+ // If true, allocate buffers for constant instructions.
+ bool allocate_buffers_for_constants_;
+
// Functor used to assign colors to newly allocated logical buffers.
BufferLiveness::Colorer colorer_;
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index bfd20921e2..eccb146a0d 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -89,7 +89,20 @@ class BufferAssignmentTest : public HloTestBase {
return BufferAssigner::Run(
module, xla::MakeUnique<DependencyHloOrdering>(module),
backend().compiler()->BufferSizeBytesFunction(),
- [alignment](LogicalBuffer::Color) { return alignment; })
+ [alignment](LogicalBuffer::Color) { return alignment; },
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/true)
+ .ConsumeValueOrDie();
+ }
+
+ std::unique_ptr<BufferAssignment> RunBufferAssignmentNoBuffersForConstants(
+ HloModule* module, int64 alignment = 1) {
+ return BufferAssigner::Run(
+ module, xla::MakeUnique<DependencyHloOrdering>(module),
+ backend().compiler()->BufferSizeBytesFunction(),
+ [alignment](LogicalBuffer::Color) { return alignment; },
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/false)
.ConsumeValueOrDie();
}
@@ -98,8 +111,9 @@ class BufferAssignmentTest : public HloTestBase {
return BufferAssigner::Run(
module, xla::MakeUnique<DependencyHloOrdering>(module),
backend().compiler()->BufferSizeBytesFunction(),
- [alignment](LogicalBuffer::Color) { return alignment; }, false,
- std::move(colorer))
+ [alignment](LogicalBuffer::Color) { return alignment; },
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/true, std::move(colorer))
.ConsumeValueOrDie();
}
@@ -115,7 +129,9 @@ class BufferAssignmentTest : public HloTestBase {
module,
xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
backend().compiler()->BufferSizeBytesFunction(),
- [alignment](LogicalBuffer::Color) { return alignment; })
+ [alignment](LogicalBuffer::Color) { return alignment; },
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/true)
.ConsumeValueOrDie();
}
@@ -294,9 +310,15 @@ TEST_F(BufferAssignmentTest, ScalarConstant) {
auto module = CreateNewModule();
module->AddEntryComputation(builder.Build());
- auto buffers = RunBufferAssignment(module.get());
- // Check that the constant does not have a buffer assigned.
- EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
+ {
+ auto buffers = RunBufferAssignment(module.get());
+ EXPECT_TRUE(buffers->HasTopLevelAllocation(const0));
+ }
+
+ {
+ auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get());
+ EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
+ }
}
TEST_F(BufferAssignmentTest, BufferForConst) {
@@ -312,12 +334,18 @@ TEST_F(BufferAssignmentTest, BufferForConst) {
auto module = CreateNewModule();
module->AddEntryComputation(builder.Build());
- auto buffers = RunBufferAssignment(module.get());
- // The two constant nodes have no buffers assigned.
- EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
- EXPECT_FALSE(buffers->HasTopLevelAllocation(const1));
- // The add node has an output buffer.
- GetAssignedOutputAllocation(*buffers, add);
+ {
+ auto buffers = RunBufferAssignment(module.get());
+ EXPECT_TRUE(buffers->HasTopLevelAllocation(const0));
+ EXPECT_TRUE(buffers->HasTopLevelAllocation(const1));
+ GetAssignedOutputAllocation(*buffers, add);
+ }
+ {
+ auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get());
+ EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
+ EXPECT_FALSE(buffers->HasTopLevelAllocation(const1));
+ GetAssignedOutputAllocation(*buffers, add);
+ }
}
TEST_F(BufferAssignmentTest, HasAllocationAt) {
@@ -1196,7 +1224,7 @@ TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) {
// TODO(b/32248867): Enable when buffer assignment gives allocations to
// constants.
-TEST_F(BufferAssignmentTest, DISABLED_TupleConstantAsOutput) {
+TEST_F(BufferAssignmentTest, TupleConstantAsOutput) {
// Test that a tuple constant which is forwarded to the computation output
// is properly handled.
auto builder = HloComputation::Builder(TestName());
@@ -1644,6 +1672,66 @@ TEST_F(BufferAssignmentTest, PeakBuffersWhile) {
nonbcast_buffer->instruction() == condition->parameter_instruction(0));
}
+TEST_F(BufferAssignmentTest, ConstantBuffersAreNotReused) {
+ const char* hlo_text = R"(
+HloModule Module
+
+True {
+ ROOT x.0.1 = f32[] parameter(0)
+}
+
+False {
+ x.0.0 = f32[] parameter(0)
+ ROOT copy.1 = f32[] copy(x.0.0)
+}
+
+ENTRY main {
+ pred.1.0 = pred[] parameter(0)
+ constant.1.1 = f32[] constant(56)
+ copy.2 = f32[] copy(constant.1.1)
+ constant.1.2 = f32[] constant(12)
+ ROOT conditional.1.3 = f32[] conditional(pred.1.0, copy.2, constant.1.2),
+ true_computation=True, false_computation=False
+}
+)";
+
+ TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+ ParseHloString(hlo_text));
+
+ HloInstruction* constant_1 =
+ module->entry_computation()->GetInstructionWithName("constant.1.1");
+ HloInstruction* constant_2 =
+ module->entry_computation()->GetInstructionWithName("constant.1.2");
+
+ auto buffers = RunBufferAssignment(module.get());
+
+ {
+ const BufferAllocation& allocation_for_const_1 =
+ GetTopLevelAllocation(*buffers, constant_1);
+ EXPECT_TRUE(allocation_for_const_1.is_constant());
+ for (const auto& buffer_offset_pair :
+ allocation_for_const_1.assigned_buffers()) {
+ EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(),
+ HloOpcode::kCopy);
+ EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(),
+ HloOpcode::kConditional);
+ }
+ }
+
+ {
+ const BufferAllocation& allocation_for_const_2 =
+ GetTopLevelAllocation(*buffers, constant_2);
+ EXPECT_TRUE(allocation_for_const_2.is_constant());
+ for (const auto& buffer_offset_pair :
+ allocation_for_const_2.assigned_buffers()) {
+ EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(),
+ HloOpcode::kCopy);
+ EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(),
+ HloOpcode::kConditional);
+ }
+ }
+}
+
class WhileBufferAssignmentTest : public HloTestBase {
protected:
std::unique_ptr<HloComputation> BuildWhileConditionComputation(
@@ -1683,7 +1771,9 @@ class WhileBufferAssignmentTest : public HloTestBase {
return BufferAssigner::Run(
module, xla::MakeUnique<SequentialHloOrdering>(module, sequence),
ByteSizeOf,
- [alignment](LogicalBuffer::Color) { return alignment; })
+ [alignment](LogicalBuffer::Color) { return alignment; },
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/true)
.ConsumeValueOrDie();
}
@@ -1833,6 +1923,74 @@ ENTRY %test_module {
EXPECT_NE(slice_param, slice_while1);
}
+TEST_F(WhileBufferAssignmentTest, ColocatedBufferWithConstant) {
+ const Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+
+ const char* module_str = R"(
+HloModule test_module
+
+%cond.v0 {
+ %param = s32[] parameter(0)
+ ROOT %constant = pred[] constant(true)
+}
+
+%cond.v1 {
+ %param.0 = s32[] parameter(0)
+ ROOT %constant.0 = pred[] constant(true)
+}
+
+%body.v0 {
+ ROOT %param.1 = s32[] parameter(0)
+}
+
+%body.v1 {
+ %param.2 = s32[] parameter(0)
+ ROOT add = s32[] add(%param.2, %param.2)
+}
+
+ENTRY %test_module {
+ %constant.42 = s32[] constant(42)
+ %while.0 = s32[] while(%constant.42), condition=%cond.v0, body=%body.v0
+ %mul = s32[] multiply(%while.0, %while.0)
+ %while.1 = s32[] while(%mul), condition=%cond.v1, body=%body.v1
+ ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={}
+})";
+
+ TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+ ParseHloString(module_str));
+
+ // Run CopyInsertion and check that the graph constructed above doesn't
+ // need any copies inserted for BufferAssignment to run.
+ int64 instruction_count = module->instruction_count();
+ CopyInsertion copy_insertion;
+ ASSERT_IS_OK(copy_insertion.Run(module.get()).status());
+ ASSERT_EQ(instruction_count, module->instruction_count());
+
+ // Get the instructions in the module.
+ const HloInstruction* bcast = module->entry_computation()->root_instruction();
+ const HloInstruction* constant =
+ module->entry_computation()->GetInstructionWithName("constant.42");
+ ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast);
+ const HloInstruction* while1 = bcast->operand(0);
+ ASSERT_EQ(while1->opcode(), HloOpcode::kWhile);
+ const HloInstruction* while0 = while1->operand(0)->operand(0);
+ ASSERT_EQ(while0->opcode(), HloOpcode::kWhile);
+
+ // Run buffer assignment.
+ auto assignment = RunBufferAssignment(module.get());
+ TF_ASSERT_OK_AND_ASSIGN(auto slice_constant,
+ assignment->GetUniqueSlice(constant, {}));
+ TF_ASSERT_OK_AND_ASSIGN(auto slice_while0,
+ assignment->GetUniqueSlice(while0, {}));
+ TF_ASSERT_OK_AND_ASSIGN(auto slice_while1,
+ assignment->GetUniqueSlice(while1, {}));
+
+ // The constant slice is part of while0's colocation set (init value), but
+ // it is not merged into while1's colocation set.
+ EXPECT_EQ(slice_constant, slice_while0);
+ EXPECT_NE(slice_constant, slice_while1);
+}
+
// Tests that the colocated buffers for while instructions are properly assigned
// during buffer assignment such that the result tuple elements are not assigned
// to the same buffer.
@@ -1927,7 +2085,9 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
module.get(),
xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
backend().compiler()->BufferSizeBytesFunction(),
- [](LogicalBuffer::Color) { return 1; }));
+ [](LogicalBuffer::Color) { return 1; },
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/true));
// The result tuple elements must be assigned with different buffers.
TF_ASSERT_OK_AND_ASSIGN(auto slice0, assignment->GetUniqueSlice(tuple, {0}));
@@ -2181,7 +2341,9 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
BufferAssigner::Run(
module.get(),
xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
- ByteSizeOf, [](LogicalBuffer::Color) { return 1; })
+ ByteSizeOf, [](LogicalBuffer::Color) { return 1; },
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/true)
.ConsumeValueOrDie();
EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
index d26486fcfe..187ce568cb 100644
--- a/tensorflow/compiler/xla/service/computation_placer.cc
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -29,9 +29,13 @@ limitations under the License.
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+using tensorflow::strings::StrAppend;
+using tensorflow::strings::StrCat;
+
namespace xla {
Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
@@ -71,6 +75,19 @@ DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) {
return std::move(assignment);
}
+string DeviceAssignment::ToString() const {
+ string output = StrCat("Computations: ", computation_count(),
+ " Replicas: ", replica_count(), "\n");
+ for (int computation = 0; computation < computation_count(); ++computation) {
+ StrAppend(&output, "Computation ", computation, ": ");
+ for (int replica = 0; replica < replica_count(); ++replica) {
+ StrAppend(&output, operator()(replica, computation), " ");
+ }
+ StrAppend(&output, "\n");
+ }
+ return output;
+}
+
StatusOr<int> ComputationPlacer::DeviceId(int replica, int computation,
int replica_count,
int computation_count) {
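Given the loops above, the new ToString() produces output along these lines
for a 2-computation, 3-replica assignment (device ids illustrative):

    Computations: 2 Replicas: 3
    Computation 0: 0 2 4
    Computation 1: 1 3 5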
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index 737d00e93e..c899ffb9dc 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -55,6 +55,8 @@ class DeviceAssignment : public Array2D<int> {
// due to a StatusOr of an incomplete type (DeviceAssignment).
static StatusOr<std::unique_ptr<DeviceAssignment>> Deserialize(
const DeviceAssignmentProto& proto);
+
+ string ToString() const;
};
// A generic implementation of the XLA computation placer, which assigns device
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index ca2a78da67..36fb9b43aa 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -351,26 +351,6 @@ Status StripControlDependenciesFrom(HloInstruction* instruction) {
return Status::OK();
}
-// Add kCopy instructions to the given module to guarantee there is no
-// live-range interference. Generally interference can only occur around kWhile
-// instructions which have update-in-place semantics.
-Status AddCopiesToResolveInterference(HloModule* module) {
- TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
- HloAliasAnalysis::Run(module));
-
- for (HloComputation* computation : module->computations()) {
- for (HloInstruction* instruction : computation->instructions()) {
- if (instruction->opcode() == HloOpcode::kWhile) {
- TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction));
- } else if (instruction->opcode() == HloOpcode::kConditional) {
- TF_RETURN_IF_ERROR(
- AddCopiesForConditional(*alias_analysis, instruction));
- }
- }
- }
- return Status::OK();
-}
-
// Class for removing unnecessary copies from the module.
//
// kCopy instructions are added conservatively to guarantee no live range
@@ -945,6 +925,36 @@ class CopyRemover {
BufferValueTracker buffer_value_tracker_;
};
+void MaybeDumpModule(const string& message, const HloModule& module) {
+ if (VLOG_IS_ON(3)) {
+ VLOG(3) << message;
+ XLA_VLOG_LINES(3, module.ToString());
+ hlo_graph_dumper::MaybeDumpHloModule(module, message);
+ }
+}
+
+} // namespace
+
+// Add kCopy instructions to the given module to guarantee there is no
+// live-range interference. Generally interference can only occur around kWhile
+// instructions which have update-in-place semantics.
+Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) {
+ TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+ HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
+
+ for (HloComputation* computation : module->computations()) {
+ for (HloInstruction* instruction : computation->instructions()) {
+ if (instruction->opcode() == HloOpcode::kWhile) {
+ TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction));
+ } else if (instruction->opcode() == HloOpcode::kConditional) {
+ TF_RETURN_IF_ERROR(
+ AddCopiesForConditional(*alias_analysis, instruction));
+ }
+ }
+ }
+ return Status::OK();
+}
+
// Add copies to address special constraints on the roots of computations not
// related to live range interference:
//
@@ -955,9 +965,10 @@ class CopyRemover {
//
// (3) Constants and parameters cannot be live out of the entry computation
//
-Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
+Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph,
+ HloModule* module) {
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
- HloAliasAnalysis::Run(module));
+ HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
// Identify which shape indices of which instructions need to be copied. Store
// these results in 'instructions_to_copy'.
@@ -1065,32 +1076,20 @@ Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
return Status::OK();
}
-Status VerifyNoLiveRangeInterference(HloModule* module) {
+Status CopyInsertion::VerifyNoLiveRangeInterference(HloModule* module) {
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
- HloAliasAnalysis::Run(module));
+ HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
DependencyHloOrdering ordering(module);
TF_RET_CHECK(!alias_analysis->HasLiveRangeInterference(ordering));
return Status::OK();
}
-void MaybeDumpModule(const string& message, const HloModule& module) {
- if (VLOG_IS_ON(3)) {
- VLOG(3) << message;
- XLA_VLOG_LINES(3, module.ToString());
- hlo_graph_dumper::MaybeDumpHloModule(module, message);
- }
-}
-
-} // namespace
-
-Status RemoveUnnecessaryCopies(
- const HloOrdering& ordering, HloModule* module,
- const HloDataflowAnalysis::FusionCanShareBufferFunction&
- fusion_can_share_buffer) {
+Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering,
+ HloModule* module) {
MaybeDumpModule("after adding copies to resolve interference", *module);
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
- HloAliasAnalysis::Run(module, fusion_can_share_buffer));
+ HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
CopyRemover copy_remover(*alias_analysis, ordering, module);
XLA_VLOG_LINES(3, copy_remover.ToString());
diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h
index e1973db928..5ba64b78a3 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/copy_insertion.h
@@ -71,20 +71,26 @@ class CopyInsertion : public HloPassInterface {
// TODO(b/62548313): Remove this when buffer assignment is module-scoped.
static StatusOr<bool> AddCopiesForBufferAssignment(HloModule* module);
+ // Try to remove as many copies from the module as possible without
+ // introducing live range interference. Only copy instructions that are
+ // eligible for copy elision are considered for removal.
+ Status RemoveUnnecessaryCopies(const HloOrdering& ordering,
+ HloModule* module);
+
private:
+ // Verifies that no HLO values have interfering live ranges, assuming the
+ // ordering used by copy insertion.
+ Status VerifyNoLiveRangeInterference(HloModule* module);
+
+ Status AddCopiesToResolveInterference(HloModule* module);
+
+ Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module);
+
// Backend specific function that decides whether a fusion can share buffer
// with its operand.
HloDataflowAnalysis::FusionCanShareBufferFunction fusion_can_share_buffer_;
};
-// Try to remove as many copies from the module as possible without introducing
-// live range interference. Only copy instructions that are eligible for
-// copy elision are considered for removal.
-Status RemoveUnnecessaryCopies(
- const HloOrdering& ordering, HloModule* module,
- const HloDataflowAnalysis::FusionCanShareBufferFunction&
- fusion_can_share_buffer = nullptr);
-
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_COPY_INSERTION_H_
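Call sites migrate from the deleted free function to the pass method; a hedged
sketch, assuming a CopyInsertion constructed with the backend's
fusion_can_share_buffer function (per the private member above), with fusion_fn
hypothetical:

    // Before: RemoveUnnecessaryCopies(ordering, module, fusion_fn);
    CopyInsertion copy_insertion(fusion_fn);
    TF_RETURN_IF_ERROR(copy_insertion.RemoveUnnecessaryCopies(ordering, module));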
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index ace9f96cfb..504b61d134 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -252,6 +252,7 @@ cc_library(
"//tensorflow/compiler/xla/service:hlo_module_config",
"//tensorflow/compiler/xla/service:name_uniquer",
"//tensorflow/compiler/xla/service/llvm_ir:alias_analysis",
+ "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
"//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util",
"//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter",
"//tensorflow/compiler/xla/service/llvm_ir:ir_array",
@@ -363,8 +364,8 @@ tf_cc_binary(
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/core:lib",
],
)
@@ -444,6 +445,7 @@ cc_library(
deps = [
":vector_support_library",
"//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+ "//tensorflow/compiler/xla/service/llvm_ir:math_ops",
"//tensorflow/core:lib",
"@llvm//:core",
"@llvm//:transform_utils",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 29fa29d33a..b49ea89896 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -562,7 +562,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
BufferAssigner::Run(
module.get(),
xla::MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
- BufferSizeBytesFunction(), memory_alignment));
+ BufferSizeBytesFunction(), memory_alignment,
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/true));
// BufferAssignment::ToString() includes a header, so no need for us to
// print one ourselves.
XLA_VLOG_LINES(2, assignment->ToString());
@@ -584,6 +586,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
std::move(computation_to_profile_idx),
&target_machine_features);
+ TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
+
for (auto embedded_computation :
entry_computation->MakeEmbeddedComputationsList()) {
if (embedded_computation->IsFusionComputation()) {
@@ -747,7 +751,9 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
BufferAssigner::Run(
module,
xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
- BufferSizeBytesFunction(), memory_alignment));
+ BufferSizeBytesFunction(), memory_alignment,
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/true));
// BufferAssignment::ToString() includes a header, so no need for us to
// print one ourselves.
XLA_VLOG_LINES(2, assignment->ToString());
@@ -776,6 +782,9 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
std::move(instruction_to_profile_idx),
std::move(computation_to_profile_idx),
&target_machine_features);
+
+ TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
+
HloComputation* computation = module->entry_computation();
for (auto embedded_computation :
computation->MakeEmbeddedComputationsList()) {
@@ -832,7 +841,8 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
BufferSizes buffer_sizes;
for (const BufferAllocation& allocation : assignment->Allocations()) {
// Callers don't need to allocate temporary buffers for parameters.
- if (allocation.is_entry_computation_parameter()) {
+ if (allocation.is_entry_computation_parameter() ||
+ allocation.is_constant()) {
buffer_sizes.push_back(-1);
continue;
}
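A hedged caller-side sketch of the -1 sentinel above (temps and Allocate are
hypothetical): entries for parameters and, with this change, constants are
never allocated by the caller.

    for (int64 i = 0; i < buffer_sizes.size(); ++i) {
      if (buffer_sizes[i] == -1) {
        continue;  // backed by a caller argument or a constant global
      }
      temps[i] = Allocate(buffer_sizes[i]);
    }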
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 1093559892..81e17a5cd4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -88,6 +88,11 @@ Status CpuExecutable::AllocateBuffers(
continue;
}
+ if (allocation.is_constant()) {
+ VLOG(3) << "allocation #" << i << " is a constant";
+ continue;
+ }
+
if (allocation.is_thread_local()) {
VLOG(3) << "buffer #" << i << " is thread-local";
continue;
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 58228180ca..645888de78 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -49,15 +49,15 @@ class MemoryTile {
// `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
// `major_dim_offset` in the major dimension. The tile size along the minor
// dimension is the vector size, and that is implicitly determined by `vsl`.
- MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* ir_builder,
+ MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* b,
llvm::Value* matrix, int64 matrix_size_along_minor_dim,
llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
- : vsl_(vsl), ir_builder_(ir_builder) {
+ : vsl_(vsl), b_(b) {
pointers_.reserve(tile_size_along_major_dim);
for (int64 i = 0; i < tile_size_along_major_dim; i++) {
- llvm::Value* total_offset = ir_builder->CreateMul(
- ir_builder->getInt64(matrix_size_along_minor_dim),
- ir_builder->CreateAdd(ir_builder->getInt64(i), major_dim_offset));
+ llvm::Value* total_offset =
+ b->CreateMul(b->getInt64(matrix_size_along_minor_dim),
+ b->CreateAdd(b->getInt64(i), major_dim_offset));
pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
}
}
@@ -101,8 +101,7 @@ class MemoryTile {
for (int64 i = 0; i < pointers_.size(); i++) {
for (int64 j = 0; j < tile_size_along_middle_dim; j++) {
result[i].push_back(vsl_->LoadBroadcast(
- pointers_[i], ir_builder_->CreateAdd(minor_dim_offset,
- ir_builder_->getInt64(j))));
+ pointers_[i], b_->CreateAdd(minor_dim_offset, b_->getInt64(j))));
}
}
return result;
@@ -110,7 +109,7 @@ class MemoryTile {
private:
VectorSupportLibrary* vsl_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
std::vector<llvm::Value*> pointers_;
};
@@ -249,16 +248,15 @@ class ColumnMajorMatrixVectorProductEmitter
ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
llvm::Value* rhs, llvm::Value* addend,
llvm::Value* result,
- llvm::IRBuilder<>* ir_builder)
+ llvm::IRBuilder<>* b)
: config_(config),
lhs_(lhs),
rhs_(rhs),
addend_(addend),
result_(result),
- ir_builder_(ir_builder),
- ksl_(ir_builder_),
- vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(),
- ir_builder_, "") {
+ b_(b),
+ ksl_(b_),
+ vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), b_, "") {
CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows())));
CHECK(!has_addend() || addend != nullptr);
}
@@ -272,7 +270,7 @@ class ColumnMajorMatrixVectorProductEmitter
bool is_first_column);
MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) {
- return MemoryTile(&vsl_, ir_builder_, /*matrix=*/lhs_,
+ return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
/*matrix_size_along_minor_dim=*/m(),
/*major_dim_offset=*/column_start,
/*tile_size_along_major_dim=*/column_count);
@@ -302,7 +300,7 @@ class ColumnMajorMatrixVectorProductEmitter
llvm::Value* rhs_;
llvm::Value* addend_;
llvm::Value* result_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
KernelSupportLibrary ksl_;
VectorSupportLibrary vsl_;
};
@@ -331,7 +329,7 @@ void ColumnMajorMatrixVectorProductEmitter::Emit() {
});
if (column_remainder != 0) {
- EmitOuterLoopBody(ir_builder_->getInt64(column_limit), column_remainder,
+ EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder,
column_limit == 0);
}
}
@@ -364,7 +362,7 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
return;
}
- llvm::Value* columns_llvm = ir_builder_->getInt64(columns);
+ llvm::Value* columns_llvm = b_->getInt64(columns);
// for (col = current_tile_col; col < (columns + current_tile_col); col++)
// for (row = row_start, row < m_; row++) {
@@ -375,12 +373,11 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
ksl_.ForReturnVoid(
"dot.inner.epilg.outer", /*start=*/current_tile_col,
- /*end=*/ir_builder_->CreateAdd(columns_llvm, current_tile_col),
+ /*end=*/b_->CreateAdd(columns_llvm, current_tile_col),
/*step=*/1, /*peel_first_iteration=*/false,
[&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
- llvm::Value* total_offset =
- ir_builder_->CreateMul(col, ir_builder_->getInt64(m()));
+ llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m()));
llvm::Value* lhs_base_pointer =
vsl_.ComputeOffsetPointer(lhs_, total_offset);
ksl_.ForReturnVoid(
@@ -388,9 +385,8 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
/*step=*/1, [&](llvm::Value* scalar_row) {
llvm::Value* product = vsl_.Mul(
vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
- llvm::Value* setting_result_first_time = ir_builder_->CreateAnd(
- is_first_scalar_col,
- ir_builder_->getInt1(is_first_tiled_column));
+ llvm::Value* setting_result_first_time = b_->CreateAnd(
+ is_first_scalar_col, b_->getInt1(is_first_tiled_column));
ksl_.IfReturnVoid(
setting_result_first_time,
/*true_block_generator=*/
@@ -478,16 +474,15 @@ class RowMajorMatrixVectorProductEmitter
RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
llvm::Value* rhs, llvm::Value* addend,
- llvm::Value* result,
- llvm::IRBuilder<>* ir_builder)
+ llvm::Value* result, llvm::IRBuilder<>* b)
: config_(config),
lhs_(lhs),
rhs_(rhs),
addend_(addend),
result_(result),
- ir_builder_(ir_builder),
- ksl_(ir_builder_),
- vsl_(scalar_type(), /*vector_size=*/tile_cols(), ir_builder_, "") {
+ b_(b),
+ ksl_(b_),
+ vsl_(scalar_type(), /*vector_size=*/tile_cols(), b_, "") {
CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols())));
CHECK(!has_addend() || addend != nullptr);
}
@@ -498,7 +493,7 @@ class RowMajorMatrixVectorProductEmitter
private:
MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) {
- return MemoryTile(&vsl_, ir_builder_, /*matrix=*/lhs_,
+ return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
/*matrix_size_along_minor_dim=*/k(),
/*major_dim_offset=*/row_start,
/*tile_size_along_major_dim=*/row_count);
@@ -517,7 +512,7 @@ class RowMajorMatrixVectorProductEmitter
llvm::Value* rhs_;
llvm::Value* addend_;
llvm::Value* result_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
KernelSupportLibrary ksl_;
VectorSupportLibrary vsl_;
};
@@ -559,7 +554,7 @@ void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
for (int i = 0; i < row_count; i++) {
llvm::Value* result_value =
vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
- llvm::Value* offset = ir_builder_->CreateAdd(ir_builder_->getInt64(i), row);
+ llvm::Value* offset = b_->CreateAdd(b_->getInt64(i), row);
if (addend_ && row_count != vsl_.vector_size()) {
result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
}
@@ -578,7 +573,7 @@ void RowMajorMatrixVectorProductEmitter::Emit() {
[&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
if (row_remainder != 0) {
- EmitOuterLoopBody(ir_builder_->getInt64(row_limit), row_remainder);
+ EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder);
}
}
@@ -609,9 +604,8 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
}
for (int r = 0; r < rows; r++) {
- llvm::Value* total_offset = ir_builder_->CreateMul(
- ir_builder_->CreateAdd(ir_builder_->getInt64(r), current_tile_row),
- ir_builder_->getInt64(k()));
+ llvm::Value* total_offset = b_->CreateMul(
+ b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k()));
llvm::Value* lhs_base_pointer =
vsl_.ComputeOffsetPointer(lhs_, total_offset);
ksl_.ForReturnVoid(
@@ -722,13 +716,13 @@ class MatrixMatrixBlockPanelEmitter {
// `lhs` with `rhs` and stores the result in `result`.
explicit MatrixMatrixBlockPanelEmitter(Config config, llvm::Value* lhs,
llvm::Value* rhs, llvm::Value* result,
- llvm::IRBuilder<>* ir_builder)
+ llvm::IRBuilder<>* b)
: lhs_(lhs),
rhs_(rhs),
result_(result),
config_(config),
- ir_builder_(ir_builder),
- ksl_(ir_builder_) {
+ b_(b),
+ ksl_(b_) {
CHECK(max_vectorization_width() > 0 &&
IsPowerOfTwo(static_cast<uint64>(max_vectorization_width())));
CHECK_GT(max_vector_count(), 0);
@@ -761,7 +755,7 @@ class MatrixMatrixBlockPanelEmitter {
int64 tile_size_m, llvm::Value* m_start,
llvm::Value* m_end);
- llvm::Value* GetInt64(int64 value) { return ir_builder_->getInt64(value); }
+ llvm::Value* GetInt64(int64 value) { return b_->getInt64(value); }
Config config() const { return config_; }
Dimensions dims() const { return config().dims(); }
@@ -782,7 +776,7 @@ class MatrixMatrixBlockPanelEmitter {
llvm::Value* result_;
Config config_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
KernelSupportLibrary ksl_;
};
@@ -804,8 +798,8 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
current_vectorization_width >= min_vectorization_width()) {
int64 n_end = dims().n() - (dims().n() % current_vectorization_width);
if (n_start != n_end) {
- VectorSupportLibrary vsl(scalar_type(), current_vectorization_width,
- ir_builder_, "gebp");
+ VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_,
+ "gebp");
HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
n_start = n_end;
}
@@ -819,10 +813,9 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
}
if (n_start != dims().n()) {
- VectorSupportLibrary vsl(scalar_type(), 1, ir_builder_, "gebp");
+ VectorSupportLibrary vsl(scalar_type(), 1, b_, "gebp");
ksl_.ForReturnVoid("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
- llvm::Value* n_i_next =
- ir_builder_->CreateAdd(n_i, ir_builder_->getInt64(1));
+ llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
HandleResiduesOnK(&vsl, n_i, n_i_next);
});
}
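
Note: HandleResiduesOnN halves the vector width until it drops below the minimum, then finishes with a scalar epilogue. For example, with n = 13, a maximum width of 8 and a minimum of 4, the column ranges [0, 8), [8, 12) and [12, 13) are handled at widths 8, 4 and 1 respectively. A plain C++ sketch of the same schedule, with the HandleResiduesOnK call replaced by a printf:

// Width-halving plan over the n dimension, mirroring HandleResiduesOnN.
#include <cstdint>
#include <cstdio>

void PlanResiduesOnN(int64_t n, int64_t max_width, int64_t min_width) {
  int64_t n_start = 0;
  int64_t width = max_width;  // assumed to be a power of two
  while (n_start != n && width >= min_width) {
    const int64_t n_end = n - (n % width);
    if (n_start != n_end) {
      std::printf("columns [%lld, %lld) at vector width %lld\n",
                  (long long)n_start, (long long)n_end, (long long)width);
      n_start = n_end;
    }
    width /= 2;
  }
  if (n_start != n)  // scalar epilogue, one column at a time
    std::printf("columns [%lld, %lld) at vector width 1\n",
                (long long)n_start, (long long)n);
}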
@@ -935,11 +928,11 @@ void MatrixMatrixBlockPanelEmitter::EmitTiledGemm(
ksl_.ForReturnVoid(
"dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
MemoryTile result_memory_tile(
- vsl, ir_builder_, /*matrix=*/result_,
+ vsl, b_, /*matrix=*/result_,
/*matrix_size_along_minor_dim=*/dims().n(),
/*major_dim_offset=*/m_i,
/*tile_size_along_major_dim=*/tile_size_m);
- MemoryTile lhs_memory_tile(vsl, ir_builder_, /*matrix=*/lhs_,
+ MemoryTile lhs_memory_tile(vsl, b_, /*matrix=*/lhs_,
/*matrix_size_along_minor_dim=*/dims().k(),
/*major_dim_offset=*/m_i,
/*tile_size_along_major_dim=*/tile_size_m);
@@ -949,8 +942,8 @@ void MatrixMatrixBlockPanelEmitter::EmitTiledGemm(
result_memory_tile.LoadTile(n_i));
ksl_.ForReturnVoid(
"dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
- MemoryTile rhs_memory_tile(vsl, ir_builder_, rhs_,
- dims().n(), k_i, tile_size_k);
+ MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i,
+ tile_size_k);
std::vector<std::vector<llvm::Value*>> lhs_tile =
lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
std::vector<llvm::Value*> rhs_tile =
@@ -980,7 +973,7 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
const llvm_ir::IrArray& rhs_array,
const llvm_ir::IrArray* addend_array,
llvm::Value* executable_run_options_value,
- llvm::IRBuilder<>* ir_builder,
+ llvm::IRBuilder<>* b,
const HloModuleConfig& hlo_module_config,
const TargetMachineFeatures& target_machine_features)
: dot_(dot),
@@ -989,7 +982,7 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
rhs_array_(rhs_array),
addend_array_(addend_array),
executable_run_options_value_(executable_run_options_value),
- ir_builder_(ir_builder),
+ b_(b),
hlo_module_config_(hlo_module_config),
target_machine_features_(target_machine_features) {}
@@ -997,15 +990,14 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
const HloInstruction& dot, const llvm_ir::IrArray& target_array,
const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
const llvm_ir::IrArray* addend_array,
- llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
+ llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
const HloModuleConfig& hlo_module_config,
const TargetMachineFeatures& target_machine_features) {
PrimitiveType type = target_array.GetShape().element_type();
TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type);
DotOpEmitter dot_emitter(dot, target_array, lhs_array, rhs_array,
- addend_array, executable_run_options_value,
- ir_builder, hlo_module_config,
- target_machine_features);
+ addend_array, executable_run_options_value, b,
+ hlo_module_config, target_machine_features);
return dot_emitter.Emit();
}
@@ -1050,13 +1042,13 @@ bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
}
int64 size_bytes = m * n * ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
- ir_builder_->CreateMemSet(
- target, ir_builder_->getInt8(0), size_bytes,
+ b_->CreateMemSet(
+ target, b_->getInt8(0), size_bytes,
target_machine_features_.minimum_alignment_for_allocation(size_bytes));
int64 max_target_vector_width =
target_machine_features_.vector_register_num_elements(
- *ir_builder_->GetInsertBlock()->getParent(), primitive_type);
+ *b_->GetInsertBlock()->getParent(), primitive_type);
int64 tile_size_m, tile_size_k, tile_size_n_in_vector_width;
std::tie(tile_size_m, tile_size_k, tile_size_n_in_vector_width) =
@@ -1080,12 +1072,12 @@ bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
KernelSupportLibrary::EmitAndCallOutlinedKernel(
/*enable_fast_math=*/enable_fast_math,
- /*optimize_for_size=*/optimize_for_size, ir_builder_,
- config.GetCacheKey(), lhs, rhs, target,
+ /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(), lhs,
+ rhs, target,
[this, config](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* target) {
- MatrixMatrixBlockPanelEmitter gebp_emitter(
- config, /*lhs=*/lhs, /*rhs=*/rhs,
- /*result=*/target, ir_builder_);
+ MatrixMatrixBlockPanelEmitter gebp_emitter(config, /*lhs=*/lhs,
+ /*rhs=*/rhs,
+ /*result=*/target, b_);
gebp_emitter.Emit();
});
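
Note: EmitAndCallOutlinedKernel emits the lambda body once into a standalone function keyed by config.GetCacheKey() and then calls it, so identical GEBP configurations within a module can share one kernel. A minimal sketch of that outline-once, call-many pattern, assuming a string-keyed cache (the real KernelSupportLibrary works on llvm::Function* and also threads the fast-math and optimize-for-size flags through):

#include <functional>
#include <map>
#include <string>

using Kernel = std::function<void(float*, float*, float*)>;

// Returns the cached kernel for `key`, emitting it on first use.
Kernel& GetOrOutline(std::map<std::string, Kernel>& cache,
                     const std::string& key,
                     const std::function<Kernel()>& emit_body) {
  auto it = cache.find(key);
  if (it == cache.end()) it = cache.emplace(key, emit_body()).first;
  return it->second;
}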
@@ -1163,7 +1155,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
const int target_vector_register_element_size =
target_machine_features_.vector_register_num_elements(
- *ir_builder_->GetInsertBlock()->getParent(), primitive_type);
+ *b_->GetInsertBlock()->getParent(), primitive_type);
// We may not always know the vector register size for the target we're
// compiling against, in which case target_vector_register_element_size is 0.
@@ -1184,13 +1176,13 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
KernelSupportLibrary::EmitAndCallOutlinedKernel(
/*enable_fast_math=*/enable_fast_math,
- /*optimize_for_size=*/optimize_for_size, ir_builder_,
- config.GetCacheKey(), lhs_op, rhs_op,
+ /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
+ lhs_op, rhs_op,
addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
[this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
llvm::Value* addend_op, llvm::Value* result_op) {
ColumnMajorMatrixVectorProductEmitter emitter(
- config, lhs_op, rhs_op, addend_op, result_op, ir_builder_);
+ config, lhs_op, rhs_op, addend_op, result_op, b_);
emitter.Emit();
});
} else {
@@ -1203,13 +1195,13 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
KernelSupportLibrary::EmitAndCallOutlinedKernel(
/*enable_fast_math=*/enable_fast_math,
- /*optimize_for_size=*/optimize_for_size, ir_builder_,
- config.GetCacheKey(), lhs_op, rhs_op,
+ /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
+ lhs_op, rhs_op,
addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
[this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
llvm::Value* addend_op, llvm::Value* result_op) {
- RowMajorMatrixVectorProductEmitter emitter(
- config, lhs_op, rhs_op, addend_op, result_op, ir_builder_);
+ RowMajorMatrixVectorProductEmitter emitter(config, lhs_op, rhs_op,
+ addend_op, result_op, b_);
emitter.Emit();
});
}
@@ -1285,11 +1277,11 @@ Status DotOpEmitter::Emit() {
// Create loop nests which loop through the LHS operand dimensions and the RHS
// operand dimensions. The reduction dimension of the LHS and RHS are handled
// in a separate innermost loop which performs the sum of products.
- llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(&dot_), ir_builder_);
- llvm_ir::IrArray::Index lhs_index = EmitOperandArrayLoopNest(
- &loop_nest, lhs_array_, lhs_reduction_dimension, "lhs");
- llvm_ir::IrArray::Index rhs_index = EmitOperandArrayLoopNest(
- &loop_nest, rhs_array_, rhs_reduction_dimension, "rhs");
+ llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(&dot_), b_);
+ llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest(
+ lhs_array_, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
+ llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest(
+ rhs_array_, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs");
// Create the loop which does the sum of products reduction.
//
@@ -1319,62 +1311,55 @@ Status DotOpEmitter::Emit() {
// Function entry basic block.
// - Emit alloca for accumulator
llvm::Function* func = reduction_loop->GetPreheaderBasicBlock()->getParent();
- SetToFirstInsertPoint(&func->getEntryBlock(), ir_builder_);
+ SetToFirstInsertPoint(&func->getEntryBlock(), b_);
llvm::Type* accum_type = target_array_.GetElementLlvmType();
- llvm::Value* accum_address = ir_builder_->CreateAlloca(
- accum_type, /*ArraySize=*/nullptr, "accum_address");
+ llvm::Value* accum_address =
+ b_->CreateAlloca(accum_type, /*ArraySize=*/nullptr, "accum_address");
// Preheader basic block of reduction loop:
// - Initialize accumulator to zero.
llvm::BasicBlock* preheader_bb = reduction_loop->GetPreheaderBasicBlock();
- ir_builder_->SetInsertPoint(preheader_bb->getTerminator());
+ b_->SetInsertPoint(preheader_bb->getTerminator());
- ir_builder_->CreateStore(llvm::Constant::getNullValue(accum_type),
- accum_address);
+ b_->CreateStore(llvm::Constant::getNullValue(accum_type), accum_address);
// Body basic block of reduction loop:
// - Load elements from lhs and rhs array.
// - Multiply lhs-element and rhs-element.
// - Load accumulator and add to product.
// - Store sum back into accumulator.
- SetToFirstInsertPoint(reduction_loop->GetBodyBasicBlock(), ir_builder_);
+ SetToFirstInsertPoint(reduction_loop->GetBodyBasicBlock(), b_);
- llvm::Value* lhs_element =
- lhs_array_.EmitReadArrayElement(lhs_index, ir_builder_);
- llvm::Value* rhs_element =
- rhs_array_.EmitReadArrayElement(rhs_index, ir_builder_);
+ llvm::Value* lhs_element = lhs_array_.EmitReadArrayElement(lhs_index, b_);
+ llvm::Value* rhs_element = rhs_array_.EmitReadArrayElement(rhs_index, b_);
- llvm::Value* accum = ir_builder_->CreateLoad(accum_address);
+ llvm::Value* accum = b_->CreateLoad(accum_address);
llvm::Value* updated_accum;
if (ShapeUtil::ElementIsComplex(lhs_shape)) {
- auto real = [&](llvm::Value* x) {
- return ir_builder_->CreateExtractValue(x, {0});
- };
- auto imag = [&](llvm::Value* x) {
- return ir_builder_->CreateExtractValue(x, {1});
- };
- llvm::Value* product_real = ir_builder_->CreateFSub(
- ir_builder_->CreateFMul(real(lhs_element), real(rhs_element)),
- ir_builder_->CreateFMul(imag(lhs_element), imag(rhs_element)));
- llvm::Value* product_imag = ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(real(lhs_element), imag(rhs_element)),
- ir_builder_->CreateFMul(imag(lhs_element), real(rhs_element)));
- updated_accum = ir_builder_->CreateInsertValue(
- accum, ir_builder_->CreateFAdd(real(accum), product_real), {0});
- updated_accum = ir_builder_->CreateInsertValue(
- updated_accum, ir_builder_->CreateFAdd(imag(accum), product_imag), {1});
+ auto real = [&](llvm::Value* x) { return b_->CreateExtractValue(x, {0}); };
+ auto imag = [&](llvm::Value* x) { return b_->CreateExtractValue(x, {1}); };
+ llvm::Value* product_real =
+ b_->CreateFSub(b_->CreateFMul(real(lhs_element), real(rhs_element)),
+ b_->CreateFMul(imag(lhs_element), imag(rhs_element)));
+ llvm::Value* product_imag =
+ b_->CreateFAdd(b_->CreateFMul(real(lhs_element), imag(rhs_element)),
+ b_->CreateFMul(imag(lhs_element), real(rhs_element)));
+ updated_accum = b_->CreateInsertValue(
+ accum, b_->CreateFAdd(real(accum), product_real), {0});
+ updated_accum = b_->CreateInsertValue(
+ updated_accum, b_->CreateFAdd(imag(accum), product_imag), {1});
} else {
- llvm::Value* product = ir_builder_->CreateFMul(lhs_element, rhs_element);
- updated_accum = ir_builder_->CreateFAdd(accum, product);
+ llvm::Value* product = b_->CreateFMul(lhs_element, rhs_element);
+ updated_accum = b_->CreateFAdd(accum, product);
}
- ir_builder_->CreateStore(updated_accum, accum_address);
+ b_->CreateStore(updated_accum, accum_address);
// Exit basic block of reduction loop.
// - Load accumulator value (the result).
// - Store into output array.
- SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), ir_builder_);
+ SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), b_);
- llvm::Value* result = ir_builder_->CreateLoad(accum_address);
+ llvm::Value* result = b_->CreateLoad(accum_address);
// Create index into target address. The target index is the concatenation of
// the rhs and lhs indexes with the reduction dimensions removed. The terms
@@ -1392,11 +1377,11 @@ Status DotOpEmitter::Emit() {
}
}
- target_array_.EmitWriteArrayElement(target_index, result, ir_builder_);
+ target_array_.EmitWriteArrayElement(target_index, result, b_);
// Set the IR builder insert point to the exit basic block of the outer most
// loop.
- ir_builder_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
+ b_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
return Status::OK();
}
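
Note: this default path is the textbook triple loop. As a scalar sketch for a row-major rank-2 dot (illustrative; the real emitter handles arbitrary ranks and layouts through the loop nest):

#include <cstdint>
#include <vector>

void NaiveDot(const std::vector<float>& lhs, const std::vector<float>& rhs,
              std::vector<float>* target, int64_t m, int64_t k, int64_t n) {
  for (int64_t i = 0; i < m; ++i) {    // lhs loop nest (reduction dim skipped)
    for (int64_t j = 0; j < n; ++j) {  // rhs loop nest (reduction dim skipped)
      float accum = 0;                 // preheader: zero the accumulator
      for (int64_t r = 0; r < k; ++r)  // reduction loop body
        accum += lhs[i * k + r] * rhs[r * n + j];
      (*target)[i * n + j] = accum;    // exit block: store the result
    }
  }
}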
@@ -1405,31 +1390,30 @@ Status DotOpEmitter::EmitScalarDot() {
// A scalar dot is just a scalar multiply.
llvm::Value* result;
// Use the same index_type for all tensor accesses in the same kernel.
- llvm::Type* index_type = ir_builder_->getInt64Ty();
+ llvm::Type* index_type = b_->getInt64Ty();
llvm_ir::IrArray::Index element_index(index_type);
llvm::Value* lhs_value =
- lhs_array_.EmitReadArrayElement(/*index=*/element_index, ir_builder_);
+ lhs_array_.EmitReadArrayElement(/*index=*/element_index, b_);
llvm::Value* rhs_value =
- rhs_array_.EmitReadArrayElement(/*index=*/element_index, ir_builder_);
+ rhs_array_.EmitReadArrayElement(/*index=*/element_index, b_);
if (ShapeUtil::ElementIsComplex(lhs_array_.GetShape())) {
-#define REAL(x) ir_builder_->CreateExtractValue(x, {0})
-#define IMAG(x) ir_builder_->CreateExtractValue(x, {1})
- llvm::Value* real = ir_builder_->CreateFSub(
- ir_builder_->CreateFMul(REAL(lhs_value), REAL(rhs_value)),
- ir_builder_->CreateFMul(IMAG(lhs_value), IMAG(rhs_value)));
- llvm::Value* imag = ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(REAL(lhs_value), IMAG(rhs_value)),
- ir_builder_->CreateFMul(IMAG(lhs_value), REAL(rhs_value)));
+#define REAL(x) b_->CreateExtractValue(x, {0})
+#define IMAG(x) b_->CreateExtractValue(x, {1})
+ llvm::Value* real =
+ b_->CreateFSub(b_->CreateFMul(REAL(lhs_value), REAL(rhs_value)),
+ b_->CreateFMul(IMAG(lhs_value), IMAG(rhs_value)));
+ llvm::Value* imag =
+ b_->CreateFAdd(b_->CreateFMul(REAL(lhs_value), IMAG(rhs_value)),
+ b_->CreateFMul(IMAG(lhs_value), REAL(rhs_value)));
#undef IMAG
#undef REAL
result = llvm::ConstantAggregateZero::get(lhs_array_.GetElementLlvmType());
- result = ir_builder_->CreateInsertValue(result, real, {0});
- result = ir_builder_->CreateInsertValue(result, imag, {1});
+ result = b_->CreateInsertValue(result, real, {0});
+ result = b_->CreateInsertValue(result, imag, {1});
} else {
- result = ir_builder_->CreateFMul(lhs_value, rhs_value);
+ result = b_->CreateFMul(lhs_value, rhs_value);
}
- target_array_.EmitWriteArrayElement(/*index=*/element_index, result,
- ir_builder_);
+ target_array_.EmitWriteArrayElement(/*index=*/element_index, result, b_);
return Status::OK();
}
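
Note: the REAL/IMAG arithmetic above is the standard complex product. For lhs = a + bi and rhs = c + di,

  (a + bi)(c + di) = (ac - bd) + (ad + bc)i,

with CreateFSub producing the real part and CreateFAdd the imaginary part, exactly as in the reduction-loop path above.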
@@ -1452,7 +1436,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
fn_name = multi_threaded
? runtime::kEigenMatMulF16SymbolName
: runtime::kEigenSingleThreadedMatMulF16SymbolName;
- float_type = ir_builder_->getHalfTy();
+ float_type = b_->getHalfTy();
break;
case F32:
fn_name = multi_threaded
@@ -1461,7 +1445,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
: (use_mkl_dnn
? runtime::kMKLSingleThreadedMatMulF32SymbolName
: runtime::kEigenSingleThreadedMatMulF32SymbolName);
- float_type = ir_builder_->getFloatTy();
+ float_type = b_->getFloatTy();
break;
case F64:
fn_name = multi_threaded
@@ -1470,7 +1454,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
: (use_mkl_dnn
? runtime::kMKLSingleThreadedMatMulF64SymbolName
: runtime::kEigenSingleThreadedMatMulF64SymbolName);
- float_type = ir_builder_->getDoubleTy();
+ float_type = b_->getDoubleTy();
break;
default:
return Unimplemented("Invalid type %s for dot operation",
@@ -1478,16 +1462,16 @@ Status DotOpEmitter::EmitCallToRuntime() {
}
llvm::Type* float_ptr_type = float_type->getPointerTo();
- llvm::Type* int64_type = ir_builder_->getInt64Ty();
- llvm::Type* int32_type = ir_builder_->getInt32Ty();
- llvm::Type* int8_ptr_type = ir_builder_->getInt8Ty()->getPointerTo();
+ llvm::Type* int64_type = b_->getInt64Ty();
+ llvm::Type* int32_type = b_->getInt32Ty();
+ llvm::Type* int8_ptr_type = b_->getInt8Ty()->getPointerTo();
llvm::FunctionType* matmul_type = llvm::FunctionType::get(
- ir_builder_->getVoidTy(),
+ b_->getVoidTy(),
{int8_ptr_type, float_ptr_type, float_ptr_type, float_ptr_type,
int64_type, int64_type, int64_type, int32_type, int32_type},
/*isVarArg=*/false);
- llvm::Function* function = ir_builder_->GetInsertBlock()->getParent();
+ llvm::Function* function = b_->GetInsertBlock()->getParent();
llvm::Module* module = function->getParent();
llvm::Function* matmul_func = llvm::cast<llvm::Function>(
@@ -1522,18 +1506,15 @@ Status DotOpEmitter::EmitCallToRuntime() {
std::swap(transpose_lhs, transpose_rhs);
}
- ir_builder_->CreateCall(
+ b_->CreateCall(
matmul_func,
- {ir_builder_->CreateBitCast(executable_run_options_value_, int8_ptr_type),
- ir_builder_->CreateBitCast(target_array_.GetBasePointer(),
- float_ptr_type),
- ir_builder_->CreateBitCast(lhs->GetBasePointer(), float_ptr_type),
- ir_builder_->CreateBitCast(rhs->GetBasePointer(), float_ptr_type),
- ir_builder_->getInt64(mat_mult_dims.m),
- ir_builder_->getInt64(mat_mult_dims.n),
- ir_builder_->getInt64(mat_mult_dims.k),
- ir_builder_->getInt32(transpose_lhs),
- ir_builder_->getInt32(transpose_rhs)});
+ {b_->CreateBitCast(executable_run_options_value_, int8_ptr_type),
+ b_->CreateBitCast(target_array_.GetBasePointer(), float_ptr_type),
+ b_->CreateBitCast(lhs->GetBasePointer(), float_ptr_type),
+ b_->CreateBitCast(rhs->GetBasePointer(), float_ptr_type),
+ b_->getInt64(mat_mult_dims.m), b_->getInt64(mat_mult_dims.n),
+ b_->getInt64(mat_mult_dims.k), b_->getInt32(transpose_lhs),
+ b_->getInt32(transpose_rhs)});
return Status::OK();
}
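
Note: the llvm::FunctionType assembled above corresponds to a C entry point of roughly the following shape (F32 variant shown; the F16/F64 and MKL-DNN variants differ only in element type and symbol, and the exact symbol name is an assumption here, not taken from this patch):

// Reconstructed C-level prototype of the runtime matmul call (sketch).
extern "C" void __xla_cpu_runtime_EigenMatMulF32(  // symbol name: assumption
    const void* run_options, float* out, float* lhs, float* rhs,
    long long m, long long n, long long k, int transpose_lhs,
    int transpose_rhs);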
@@ -1556,36 +1537,6 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0};
}
-llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
- llvm_ir::ForLoopNest* loop_nest, const llvm_ir::IrArray& operand_array,
- int64 reduction_dimension, tensorflow::StringPiece name_suffix) {
- // Prepares the dimension list we will use to emit the loop nest. Outermost
- // loops are added first. Add loops in major-to-minor order, and skip the
- // reduction dimension.
- std::vector<int64> dimensions;
- const Shape& shape = operand_array.GetShape();
- for (int i = LayoutUtil::MinorToMajor(shape).size() - 1; i >= 0; --i) {
- int64 dimension = LayoutUtil::Minor(shape.layout(), i);
- if (dimension != reduction_dimension) {
- dimensions.push_back(dimension);
- }
- }
-
- // Create loop nest with one for-loop for each dimension of the
- // output.
- llvm_ir::IrArray::Index index =
- loop_nest->AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix);
- // Verify every dimension except the reduction dimension was set in the index.
- for (int dimension = 0; dimension < index.size(); ++dimension) {
- if (dimension == reduction_dimension) {
- DCHECK_EQ(nullptr, index[dimension]);
- } else {
- DCHECK_NE(nullptr, index[dimension]);
- }
- }
- return index;
-}
-
// Return whether the given shape is a matrix with no padding.
static bool IsRank2WithNoPadding(const Shape& shape) {
return ShapeUtil::Rank(shape) == 2 && !LayoutUtil::IsPadded(shape);
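
Note: the EmitOperandArrayLoopNest helper deleted above now lives on llvm_ir::ForLoopNest (see the new call sites in Emit()). Its dimension ordering, restated as scalar C++ for reference:

#include <cstdint>
#include <vector>

// Outermost loops first, major-to-minor, skipping the reduction dimension.
std::vector<int64_t> LoopDimensions(const std::vector<int64_t>& minor_to_major,
                                    int64_t reduction_dimension) {
  std::vector<int64_t> dimensions;
  for (int64_t i = (int64_t)minor_to_major.size() - 1; i >= 0; --i) {
    const int64_t dimension = minor_to_major[i];
    if (dimension != reduction_dimension) dimensions.push_back(dimension);
  }
  return dimensions;  // e.g. layout {0,1,2}, reduction 1 -> loops over {2,0}
}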
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index ed2a18976a..590032fbe9 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -61,7 +61,7 @@ class DotOpEmitter {
const HloInstruction& dot, const llvm_ir::IrArray& target_array,
const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
const llvm_ir::IrArray* addend_array,
- llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
+ llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
const HloModuleConfig& hlo_module_config,
const TargetMachineFeatures& target_machine_features);
@@ -70,8 +70,7 @@ class DotOpEmitter {
const llvm_ir::IrArray& lhs_array,
const llvm_ir::IrArray& rhs_array,
const llvm_ir::IrArray* addend_array,
- llvm::Value* executable_run_options_value,
- llvm::IRBuilder<>* ir_builder,
+ llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
const HloModuleConfig& hlo_module_config,
const TargetMachineFeatures& target_machine_features);
@@ -89,17 +88,6 @@ class DotOpEmitter {
// Emits a call to the CPU runtime to perform the matrix multiply.
Status EmitCallToRuntime();
- // Emits a series of nested loops for iterating over an operand array in the
- // dot operation. Loops are constructed in major to minor dimension layout
- // order. No loop is emitted for the given reduction_dimension. The function
- // returns an IrArray index for the given operand_array containing the indvars
- // of the loops. All dimensions of the index are filled except for the
- // reduction dimension. name_suffix is the string to append to the names of
- // LLVM constructs (eg, basic blocks) constructed by this method.
- llvm_ir::IrArray::Index EmitOperandArrayLoopNest(
- llvm_ir::ForLoopNest* loop_nest, const llvm_ir::IrArray& operand_array,
- int64 reduction_dimension, tensorflow::StringPiece name_suffix);
-
// Represents the dimensions of a matrix-matrix multiply operation.
struct MatMultDims {
// The number of rows in the LHS.
@@ -171,7 +159,7 @@ class DotOpEmitter {
const llvm_ir::IrArray& rhs_array_;
const llvm_ir::IrArray* addend_array_;
llvm::Value* executable_run_options_value_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
const HloModuleConfig& hlo_module_config_;
const TargetMachineFeatures& target_machine_features_;
};
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index e97113dfa0..cf955a8add 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -38,8 +38,7 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
switch (element_type) {
case F16:
cast_result_to_fp16 = true;
- operand_value = ir_builder_->CreateFPCast(operand_value,
- ir_builder_->getFloatTy());
+ operand_value = b_->CreateFPCast(operand_value, b_->getFloatTy());
TF_FALLTHROUGH_INTENDED;
case F32:
function_name = "tanhf";
@@ -59,9 +58,9 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
function->setDoesNotThrow();
function->setDoesNotAccessMemory();
// Create an instruction to call the function.
- llvm::Value* result = ir_builder_->CreateCall(function, operand_value);
+ llvm::Value* result = b_->CreateCall(function, operand_value);
if (cast_result_to_fp16) {
- result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+ result = b_->CreateFPCast(result, b_->getHalfTy());
}
return result;
}
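
Note: the F16 path widens to F32, calls the C runtime's tanhf, and narrows back, since libm has no half-precision tanh. A scalar model (HalfT is a stand-in for the LLVM half type; the template itself is illustrative):

#include <cmath>

template <typename HalfT>
HalfT TanhHalf(HalfT x) {
  float widened = static_cast<float>(x);  // CreateFPCast to getFloatTy()
  float result = std::tanh(widened);      // CreateCall of "tanhf"
  return static_cast<HalfT>(result);      // CreateFPCast back to getHalfTy()
}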
@@ -77,8 +76,8 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
switch (prim_type) {
case F16:
cast_result_to_fp16 = true;
- lhs = ir_builder_->CreateFPCast(lhs, ir_builder_->getFloatTy());
- rhs = ir_builder_->CreateFPCast(rhs, ir_builder_->getFloatTy());
+ lhs = b_->CreateFPCast(lhs, b_->getFloatTy());
+ rhs = b_->CreateFPCast(rhs, b_->getFloatTy());
TF_FALLTHROUGH_INTENDED;
case F32:
function_name = "atan2f";
@@ -98,9 +97,9 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
function->setDoesNotThrow();
function->setDoesNotAccessMemory();
// Create an instruction to call the function.
- llvm::Value* result = ir_builder_->CreateCall(function, {lhs, rhs});
+ llvm::Value* result = b_->CreateCall(function, {lhs, rhs});
if (cast_result_to_fp16) {
- result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+ result = b_->CreateFPCast(result, b_->getHalfTy());
}
return result;
}
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
index 4446dfd282..9598a886ab 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
@@ -31,7 +31,7 @@ class CpuElementalIrEmitter : public ElementalIrEmitter {
public:
CpuElementalIrEmitter(const HloModuleConfig& module_config,
IrEmitter* ir_emitter, llvm::Module* module)
- : ElementalIrEmitter(module_config, module, ir_emitter->ir_builder()),
+ : ElementalIrEmitter(module_config, module, ir_emitter->b()),
ir_emitter_(ir_emitter) {}
llvm_ir::ElementGenerator MakeElementGenerator(
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 05f431642c..a6d8551841 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -51,6 +51,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
@@ -89,14 +90,14 @@ IrEmitter::IrEmitter(
: assignment_(assignment),
module_(llvm_module),
arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()),
- ir_builder_(llvm_module->getContext()),
+ b_(llvm_module->getContext()),
instruction_to_profile_idx_(std::move(instruction_to_profile_idx)),
computation_to_profile_idx_(std::move(computation_to_profile_idx)),
alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
hlo_module_config_(hlo_module.config()),
is_top_level_computation_(false),
target_machine_features_(*target_machine_features) {
- ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(
+ b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
/*fast_math_enabled=*/hlo_module_config_.debug_options()
.xla_enable_fast_math()));
}
@@ -146,7 +147,7 @@ void IrEmitter::InitializeIrFunction(const string& function_name) {
new IrFunction(function_name, linkage,
options::OptimizeForSizeRequested(hlo_module_config_),
hlo_module_config_.debug_options().xla_enable_fast_math(),
- module_, &ir_builder_, num_dynamic_loop_bounds_));
+ module_, &b_, num_dynamic_loop_bounds_));
}
IrEmitter::~IrEmitter() {}
@@ -154,9 +155,9 @@ IrEmitter::~IrEmitter() {}
Status IrEmitter::HandleBitcast(HloInstruction* bitcast) {
VLOG(2) << "HandleBitcast: " << bitcast->ToString();
emitted_value_[bitcast] =
- ir_builder_.CreateBitCast(GetEmittedValueFor(bitcast->operand(0)),
- IrShapeType(bitcast->shape())->getPointerTo(),
- AsStringRef(IrName(bitcast)));
+ b_.CreateBitCast(GetEmittedValueFor(bitcast->operand(0)),
+ IrShapeType(bitcast->shape())->getPointerTo(),
+ AsStringRef(IrName(bitcast)));
return Status::OK();
}
@@ -175,25 +176,36 @@ llvm::Constant* IrEmitter::EmitGlobalForLiteral(const Literal& literal) {
result_global, IrShapeType(literal.shape())->getPointerTo());
}
-Status IrEmitter::HandleConstant(HloInstruction* constant) {
- VLOG(2) << "HandleConstant: " << constant->ToString();
- const Literal& literal = constant->literal();
- llvm::Constant* global_for_const;
+Status IrEmitter::EmitConstantGlobals() {
+ for (const BufferAllocation& allocation : assignment_.Allocations()) {
+ if (!allocation.is_constant()) {
+ continue;
+ }
- auto it = emitted_literals_.find(&literal);
- if (it != emitted_literals_.end()) {
- global_for_const = it->second;
- } else {
- global_for_const = EmitGlobalForLiteral(literal);
- emitted_literals_[&literal] = global_for_const;
+ const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation);
+ llvm::Constant* global_for_const;
+ auto it = emitted_literals_.find(&literal);
+ if (it != emitted_literals_.end()) {
+ global_for_const = it->second;
+ } else {
+ global_for_const = EmitGlobalForLiteral(literal);
+ InsertOrDie(&emitted_literals_, &literal, global_for_const);
+ }
+
+ InsertOrDie(&constant_buffer_to_global_, allocation.index(),
+ global_for_const);
}
- emitted_value_[constant] = global_for_const;
- VLOG(2) << " emitted value: " << llvm_ir::DumpToString(*global_for_const);
- VLOG(2) << " its type: "
- << llvm_ir::DumpToString(*global_for_const->getType());
+
return Status::OK();
}
+Status IrEmitter::HandleConstant(HloInstruction* constant) {
+ VLOG(2) << "HandleConstant: " << constant->ToString();
+ // IrEmitter::EmitConstantGlobals has already taken care of emitting the body
+ // of the constant.
+ return EmitTargetAddressForOp(constant);
+}
+
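
Note: EmitConstantGlobals walks every constant BufferAllocation up front and deduplicates by Literal pointer, so two allocations backed by the same literal share one LLVM global. The lookup-or-emit step as a standalone sketch, with stand-in types (the real code maps BufferAllocation::Index to llvm::Constant*):

#include <map>

template <typename LiteralT, typename GlobalT>
GlobalT* GetOrEmitGlobal(std::map<const LiteralT*, GlobalT*>& emitted,
                         const LiteralT* literal,
                         GlobalT* (*emit)(const LiteralT&)) {
  auto it = emitted.find(literal);
  if (it != emitted.end()) return it->second;  // already emitted: reuse
  GlobalT* global = emit(*literal);
  emitted.emplace(literal, global);
  return global;
}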
Status IrEmitter::HandleCopy(HloInstruction* copy) {
if (ShapeUtil::IsTuple(copy->shape())) {
// kCopy shallow copies a tuple so just memcpy the top-level buffer.
@@ -273,7 +285,7 @@ Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) {
const Shape& shape = get_tuple_element->shape();
emitted_value_[get_tuple_element] = llvm_ir::EmitGetTupleElement(
shape, get_tuple_element->tuple_index(), MinimumAlignmentForShape(shape),
- GetEmittedValueFor(operand), &ir_builder_, module_);
+ GetEmittedValueFor(operand), &b_, module_);
return Status::OK();
}
@@ -293,7 +305,7 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
TF_RETURN_IF_ERROR(EmitTargetAddressForOp(tuple_select));
llvm_ir::EmitTupleSelect(GetIrArrayFor(tuple_select), GetIrArrayFor(pred),
GetEmittedValueFor(on_true),
- GetEmittedValueFor(on_false), &ir_builder_, module_);
+ GetEmittedValueFor(on_false), &b_, module_);
return Status::OK();
}
@@ -316,8 +328,8 @@ Status IrEmitter::HandleInfeed(HloInstruction* instruction) {
assignment_.GetUniqueSlice(infeed, {1}));
llvm::Value* token_address = EmitTempBufferPointer(
token_slice, ShapeUtil::GetTupleElementShape(infeed->shape(), 1));
- llvm_ir::EmitTuple(GetIrArrayFor(infeed), {data_address, token_address},
- &ir_builder_, module_);
+ llvm_ir::EmitTuple(GetIrArrayFor(infeed), {data_address, token_address}, &b_,
+ module_);
if (ShapeUtil::IsTuple(data_shape)) {
TF_RET_CHECK(!ShapeUtil::IsNestedTuple(data_shape));
@@ -348,7 +360,7 @@ Status IrEmitter::HandleInfeed(HloInstruction* instruction) {
}
llvm_ir::EmitTuple(llvm_ir::IrArray(data_address, data_shape),
- tuple_element_addresses, &ir_builder_, module_);
+ tuple_element_addresses, &b_, module_);
} else {
TF_RETURN_IF_ERROR(
EmitXfeedTransfer(XfeedKind::kInfeed, data_shape, data_address));
@@ -369,14 +381,14 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
int32 length_32 = static_cast<int32>(length);
int32 shape_length;
- TF_ASSIGN_OR_RETURN(llvm::Value * shape_ptr,
- llvm_ir::EncodeSelfDescribingShapeConstant(
- shape, &shape_length, &ir_builder_));
+ TF_ASSIGN_OR_RETURN(
+ llvm::Value * shape_ptr,
+ llvm_ir::EncodeSelfDescribingShapeConstant(shape, &shape_length, &b_));
// The signature of the acquire infeed buffer function is:
//
  //   (void*)(int32 length, void* shape, int32 shape_length);
- llvm::Type* int32_type = ir_builder_.getInt32Ty();
+ llvm::Type* int32_type = b_.getInt32Ty();
llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(module_->getContext());
llvm::FunctionType* acquire_type = llvm::FunctionType::get(
i8_ptr_type, {int32_type, i8_ptr_type, int32_type},
@@ -396,8 +408,7 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
//
  //   (void)(int32 length, void* buffer, void* shape, int32 shape_length);
llvm::FunctionType* release_type = llvm::FunctionType::get(
- ir_builder_.getVoidTy(),
- {int32_type, i8_ptr_type, i8_ptr_type, int32_type},
+ b_.getVoidTy(), {int32_type, i8_ptr_type, i8_ptr_type, int32_type},
/*isVarArg=*/false);
llvm::Function* release_func;
@@ -414,25 +425,22 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
// of size exactly 'length_32', and the runtime is responsible for
// check-failing the process if there is a mismatch, versus passing us back a
// buffer that we might overrun.
- llvm::Value* acquired_pointer = ir_builder_.CreateCall(
- acquire_func, {ir_builder_.getInt32(length_32), shape_ptr,
- ir_builder_.getInt32(shape_length)});
+ llvm::Value* acquired_pointer = b_.CreateCall(
+ acquire_func,
+ {b_.getInt32(length_32), shape_ptr, b_.getInt32(shape_length)});
if (kind == XfeedKind::kInfeed) {
// Copy to the program buffer address from the acquired buffer.
- ir_builder_.CreateMemCpy(program_buffer_address, /*DstAlign=*/1,
- acquired_pointer,
- /*SrcAlign=*/1, length_32);
+ b_.CreateMemCpy(program_buffer_address, /*DstAlign=*/1, acquired_pointer,
+ /*SrcAlign=*/1, length_32);
} else {
// Outfeed -- copy from the in-program address to the acquired buffer.
- ir_builder_.CreateMemCpy(acquired_pointer, /*DstAlign=*/1,
- program_buffer_address,
- /*SrcAlign=*/1, length_32);
+ b_.CreateMemCpy(acquired_pointer, /*DstAlign=*/1, program_buffer_address,
+ /*SrcAlign=*/1, length_32);
}
- ir_builder_.CreateCall(release_func,
- {ir_builder_.getInt32(length_32), acquired_pointer,
- shape_ptr, ir_builder_.getInt32(shape_length)});
+ b_.CreateCall(release_func, {b_.getInt32(length_32), acquired_pointer,
+ shape_ptr, b_.getInt32(shape_length)});
return Status::OK();
}
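
Note: for reference, the acquire/release pair the emitted code targets, written as C prototypes reconstructed from the llvm::FunctionTypes above (names are illustrative). Acquire returns a buffer of exactly `length` bytes; the program copies out of it (infeed) or into it (outfeed), then releases it:

extern "C" void* AcquireXfeedBuffer(int length, const void* shape,
                                    int shape_length);
extern "C" void ReleaseXfeedBuffer(int length, void* buffer,
                                   const void* shape, int shape_length);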
@@ -453,7 +461,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
ShapeUtil::GetTupleElementShape(operand_shape, i);
llvm::Value* tuple_element = llvm_ir::EmitGetTupleElement(
tuple_element_shape, i, MinimumAlignmentForShape(tuple_element_shape),
- value, &ir_builder_, module_);
+ value, &b_, module_);
TF_RETURN_IF_ERROR(EmitXfeedTransfer(XfeedKind::kOutfeed,
tuple_element_shape, tuple_element));
}
@@ -472,7 +480,7 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
for (auto operand : tuple->operands()) {
base_ptrs.push_back(GetEmittedValueFor(operand));
}
- llvm_ir::EmitTuple(GetIrArrayFor(tuple), base_ptrs, &ir_builder_, module_);
+ llvm_ir::EmitTuple(GetIrArrayFor(tuple), base_ptrs, &b_, module_);
return Status::OK();
}
@@ -483,8 +491,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForMap(
std::vector<llvm::Value*> parameter_addresses;
for (const HloInstruction* operand : map->operands()) {
const llvm_ir::IrArray& array = GetIrArrayFor(operand);
- parameter_addresses.push_back(
- array.EmitArrayElementAddress(index, &ir_builder_));
+ parameter_addresses.push_back(array.EmitArrayElementAddress(index, &b_));
}
return EmitElementFunctionCall(mapped_ir_function, map->shape(),
parameter_addresses, "map_function");
@@ -510,13 +517,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
PrimitiveType operand_element_type = operand->shape().element_type();
llvm::Value* accumulator_address = llvm_ir::EmitAllocaAtFunctionEntry(
llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_),
- "reduce_window_accumulator_address", &ir_builder_,
+ "reduce_window_accumulator_address", &b_,
MinimumAlignmentForPrimitiveType(operand_element_type));
- ir_builder_.CreateStore(
- ir_builder_.CreateLoad(GetEmittedValueFor(reduce_window->operand(1))),
- accumulator_address);
+ b_.CreateStore(b_.CreateLoad(GetEmittedValueFor(reduce_window->operand(1))),
+ accumulator_address);
- llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &ir_builder_);
+ llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &b_);
std::vector<int64> window_size;
for (const auto& dim : window.dimensions()) {
window_size.push_back(dim.size());
@@ -525,48 +531,47 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
ShapeUtil::MakeShape(operand_element_type, window_size), "window");
CHECK_EQ(window_index.size(), index.size());
- SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
- llvm_ir::IrArray::Index input_index(ir_builder_.getInt64Ty(), index.size());
+ llvm_ir::IrArray::Index input_index(b_.getInt64Ty(), index.size());
llvm::Value* in_bounds_condition = nullptr;
for (size_t i = 0; i < index.size(); ++i) {
- llvm::Value* strided_index = ir_builder_.CreateNSWMul(
- index[i], ir_builder_.getInt64(window.dimensions(i).stride()));
- input_index[i] = ir_builder_.CreateNSWSub(
- ir_builder_.CreateNSWAdd(strided_index, window_index[i]),
- ir_builder_.getInt64(window.dimensions(i).padding_low()));
+ llvm::Value* strided_index =
+ b_.CreateNSWMul(index[i], b_.getInt64(window.dimensions(i).stride()));
+ input_index[i] =
+ b_.CreateNSWSub(b_.CreateNSWAdd(strided_index, window_index[i]),
+ b_.getInt64(window.dimensions(i).padding_low()));
// We need to check if 0 <= input_index[i] < bound, as otherwise we are in
    // the padding and can skip the computation. That is equivalent to
// input_index[i] < bound as an *unsigned* comparison, since a negative
// value will wrap to a large positive value.
- llvm::Value* index_condition = ir_builder_.CreateICmpULT(
+ llvm::Value* index_condition = b_.CreateICmpULT(
input_index[i],
- ir_builder_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
+ b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
if (in_bounds_condition == nullptr) {
in_bounds_condition = index_condition;
} else {
- in_bounds_condition =
- ir_builder_.CreateAnd(in_bounds_condition, index_condition);
+ in_bounds_condition = b_.CreateAnd(in_bounds_condition, index_condition);
}
}
CHECK(in_bounds_condition != nullptr);
llvm_ir::LlvmIfData if_data =
- llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_);
- SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+ llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_);
+ SetToFirstInsertPoint(if_data.true_block, &b_);
// We are not in the padding, so carry out the computation.
llvm_ir::IrArray input_array(GetIrArrayFor(operand));
llvm::Value* input_value_address =
- input_array.EmitArrayElementAddress(input_index, &ir_builder_);
+ input_array.EmitArrayElementAddress(input_index, &b_);
llvm::Value* result = EmitElementFunctionCall(
reducer_function, reduce_window->shape(),
{accumulator_address, input_value_address}, "reducer_function");
- ir_builder_.CreateStore(result, accumulator_address);
+ b_.CreateStore(result, accumulator_address);
- SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
- return ir_builder_.CreateLoad(accumulator_address);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
+ return b_.CreateLoad(accumulator_address);
}
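
Note: the single CreateICmpULT per dimension suffices because a negative index, viewed as unsigned, wraps to a value far above any bound. A two-line demonstration:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t bound = 5;
  for (int64_t i : {-2, 0, 4, 7})  // -2 and 7 are both out of bounds
    std::printf("index %lld in bounds: %d\n", (long long)i,
                (uint64_t)i < (uint64_t)bound);
}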
Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
@@ -649,141 +654,127 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
select_and_scatter, /*desc=*/IrName(select_and_scatter, "init"),
[this, init_value](const llvm_ir::IrArray::Index& target_index) {
llvm::Value* init_value_addr = GetEmittedValueFor(init_value);
- return ir_builder_.CreateLoad(init_value_addr);
+ return b_.CreateLoad(init_value_addr);
}));
// Create a loop to iterate over the source array to scatter to the output.
- llvm_ir::ForLoopNest source_loops(IrName(select_and_scatter), &ir_builder_);
+ llvm_ir::ForLoopNest source_loops(IrName(select_and_scatter), &b_);
const llvm_ir::IrArray::Index source_index =
source_loops.AddLoopsForShape(source->shape(), "source");
- SetToFirstInsertPoint(source_loops.GetInnerLoopBodyBasicBlock(),
- &ir_builder_);
+ SetToFirstInsertPoint(source_loops.GetInnerLoopBodyBasicBlock(), &b_);
// Allocate space to keep the currently selected value, its index, and
// the boolean initialized_flag, which is initially set to false.
llvm::Value* selected_value_address = llvm_ir::EmitAllocaAtFunctionEntry(
llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_),
- "selected_value_address", &ir_builder_,
+ "selected_value_address", &b_,
MinimumAlignmentForPrimitiveType(operand_element_type));
llvm::Value* selected_index_address =
llvm_ir::EmitAllocaAtFunctionEntryWithCount(
- ir_builder_.getInt64Ty(), ir_builder_.getInt32(rank),
- "selected_index_address", &ir_builder_);
+ b_.getInt64Ty(), b_.getInt32(rank), "selected_index_address", &b_);
llvm::Value* initialized_flag_address = llvm_ir::EmitAllocaAtFunctionEntry(
- ir_builder_.getInt1Ty(), "initialized_flag_address", &ir_builder_);
- ir_builder_.CreateStore(ir_builder_.getInt1(false), initialized_flag_address);
+ b_.getInt1Ty(), "initialized_flag_address", &b_);
+ b_.CreateStore(b_.getInt1(false), initialized_flag_address);
// Create the inner loop to iterate over the window.
- llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "window"),
- &ir_builder_);
+ llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "window"), &b_);
std::vector<int64> window_size;
for (const auto& dim : window.dimensions()) {
window_size.push_back(dim.size());
}
const llvm_ir::IrArray::Index window_index = window_loops.AddLoopsForShape(
ShapeUtil::MakeShape(operand_element_type, window_size), "window");
- SetToFirstInsertPoint(window_loops.GetInnerLoopBodyBasicBlock(),
- &ir_builder_);
+ SetToFirstInsertPoint(window_loops.GetInnerLoopBodyBasicBlock(), &b_);
// Compute the operand index to visit and check whether it is within the
// operand's bounds. The unsigned comparison also covers the check that the
// operand index is >= 0.
- llvm_ir::IrArray::Index operand_index(ir_builder_.getInt64Ty(),
- source_index.size());
- llvm::Value* in_bounds_condition = ir_builder_.getTrue();
+ llvm_ir::IrArray::Index operand_index(b_.getInt64Ty(), source_index.size());
+ llvm::Value* in_bounds_condition = b_.getTrue();
for (int64 i = 0; i < rank; ++i) {
- llvm::Value* strided_index = ir_builder_.CreateNSWMul(
- source_index[i], ir_builder_.getInt64(window.dimensions(i).stride()));
- operand_index[i] = ir_builder_.CreateNSWSub(
- ir_builder_.CreateNSWAdd(strided_index, window_index[i]),
- ir_builder_.getInt64(window.dimensions(i).padding_low()));
- llvm::Value* index_condition = ir_builder_.CreateICmpULT(
+ llvm::Value* strided_index = b_.CreateNSWMul(
+ source_index[i], b_.getInt64(window.dimensions(i).stride()));
+ operand_index[i] =
+ b_.CreateNSWSub(b_.CreateNSWAdd(strided_index, window_index[i]),
+ b_.getInt64(window.dimensions(i).padding_low()));
+ llvm::Value* index_condition = b_.CreateICmpULT(
operand_index[i],
- ir_builder_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
- in_bounds_condition =
- ir_builder_.CreateAnd(in_bounds_condition, index_condition);
+ b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
+ in_bounds_condition = b_.CreateAnd(in_bounds_condition, index_condition);
}
CHECK(in_bounds_condition != nullptr);
// Only need to do something if the operand index is within the bounds. First
// check if the initialized_flag is set.
llvm_ir::LlvmIfData if_in_bounds =
- llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_);
- SetToFirstInsertPoint(if_in_bounds.true_block, &ir_builder_);
- llvm_ir::LlvmIfData if_initialized =
- llvm_ir::EmitIfThenElse(ir_builder_.CreateLoad(initialized_flag_address),
- "initialized", &ir_builder_);
+ llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_);
+ SetToFirstInsertPoint(if_in_bounds.true_block, &b_);
+ llvm_ir::LlvmIfData if_initialized = llvm_ir::EmitIfThenElse(
+ b_.CreateLoad(initialized_flag_address), "initialized", &b_);
// If the initialized_flag is false, initialize the selected value and index
// with the currently visiting operand.
- SetToFirstInsertPoint(if_initialized.false_block, &ir_builder_);
+ SetToFirstInsertPoint(if_initialized.false_block, &b_);
const auto save_operand_index =
[&](const llvm_ir::IrArray::Index& operand_index) {
for (int64 i = 0; i < rank; ++i) {
llvm::Value* selected_index_address_slot =
- ir_builder_.CreateInBoundsGEP(selected_index_address,
- {ir_builder_.getInt32(i)});
- ir_builder_.CreateStore(operand_index[i],
- selected_index_address_slot);
+ b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)});
+ b_.CreateStore(operand_index[i], selected_index_address_slot);
}
};
llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
llvm::Value* operand_data =
- operand_array.EmitReadArrayElement(operand_index, &ir_builder_);
- ir_builder_.CreateStore(operand_data, selected_value_address);
+ operand_array.EmitReadArrayElement(operand_index, &b_);
+ b_.CreateStore(operand_data, selected_value_address);
save_operand_index(operand_index);
- ir_builder_.CreateStore(ir_builder_.getInt1(true), initialized_flag_address);
+ b_.CreateStore(b_.getInt1(true), initialized_flag_address);
// If the initialized_flag is true, call the `select` function to potentially
// update the selected value and index with the currently visiting operand.
- SetToFirstInsertPoint(if_initialized.true_block, &ir_builder_);
+ SetToFirstInsertPoint(if_initialized.true_block, &b_);
const Shape output_shape = ShapeUtil::MakeShape(PRED, {});
llvm::Value* operand_address =
- operand_array.EmitArrayElementAddress(operand_index, &ir_builder_);
+ operand_array.EmitArrayElementAddress(operand_index, &b_);
llvm::Value* result = EmitElementFunctionCall(
select_function, output_shape, {selected_value_address, operand_address},
"select_function");
// If the 'select' function returns false, update the selected value and the
// index to the currently visiting operand.
- llvm::Value* cond = ir_builder_.CreateICmpNE(
+ llvm::Value* cond = b_.CreateICmpNE(
result,
llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
"boolean_predicate");
llvm_ir::LlvmIfData if_select_lhs =
- llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &ir_builder_);
- SetToFirstInsertPoint(if_select_lhs.false_block, &ir_builder_);
- ir_builder_.CreateStore(ir_builder_.CreateLoad(operand_address),
- selected_value_address);
+ llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &b_);
+ SetToFirstInsertPoint(if_select_lhs.false_block, &b_);
+ b_.CreateStore(b_.CreateLoad(operand_address), selected_value_address);
save_operand_index(operand_index);
// After iterating over the window elements, scatter the source element to
// the selected index of the output. The value we store at the output
// location is computed by calling the `scatter` function with the source
// value and the current output value.
- SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(),
- &ir_builder_);
+ SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(), &b_);
llvm_ir::IrArray::Index selected_index(source_index.GetType());
for (int64 i = 0; i < rank; ++i) {
- llvm::Value* selected_index_address_slot = ir_builder_.CreateInBoundsGEP(
- selected_index_address, {ir_builder_.getInt32(i)});
- selected_index.push_back(
- ir_builder_.CreateLoad(selected_index_address_slot));
+ llvm::Value* selected_index_address_slot =
+ b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)});
+ selected_index.push_back(b_.CreateLoad(selected_index_address_slot));
}
llvm_ir::IrArray source_array(GetIrArrayFor(source));
llvm::Value* source_value_address =
- source_array.EmitArrayElementAddress(source_index, &ir_builder_);
+ source_array.EmitArrayElementAddress(source_index, &b_);
llvm_ir::IrArray output_array(GetIrArrayFor(select_and_scatter));
llvm::Value* output_value_address =
- output_array.EmitArrayElementAddress(selected_index, &ir_builder_);
+ output_array.EmitArrayElementAddress(selected_index, &b_);
llvm::Value* scatter_value = EmitElementFunctionCall(
scatter_function, source->shape(),
{output_value_address, source_value_address}, "scatter_function");
- output_array.EmitWriteArrayElement(selected_index, scatter_value,
- &ir_builder_);
+ output_array.EmitWriteArrayElement(selected_index, scatter_value, &b_);
- SetToFirstInsertPoint(source_loops.GetOuterLoopExitBasicBlock(),
- &ir_builder_);
+ SetToFirstInsertPoint(source_loops.GetOuterLoopExitBasicBlock(), &b_);
return Status::OK();
}
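
Note: end to end, the emitted loops implement the following algorithm, shown here as a 1-D scalar sketch assuming a greater-than-or-equal select and an add scatter (the common maxpool-gradient configuration; the real op takes arbitrary select/scatter computations, and the output is pre-filled with the init value by the initialization loop above):

#include <cstdint>
#include <vector>

void SelectAndScatter1D(const std::vector<float>& operand,
                        const std::vector<float>& source,
                        int64_t window_size, int64_t stride, int64_t pad_low,
                        std::vector<float>* output /* pre-filled with init */) {
  for (int64_t s = 0; s < (int64_t)source.size(); ++s) {
    bool initialized = false;       // initialized_flag_address
    int64_t selected_index = 0;     // selected_index_address
    float selected_value = 0.f;     // selected_value_address
    for (int64_t w = 0; w < window_size; ++w) {
      const int64_t i = s * stride + w - pad_low;
      // Unsigned compare folds the i >= 0 and i < bound checks into one.
      if ((uint64_t)i >= operand.size()) continue;
      // Update the selection when uninitialized or when `select` says no.
      if (!initialized || !(selected_value >= operand[i])) {
        selected_value = operand[i];
        selected_index = i;
        initialized = true;
      }
    }
    if (initialized) (*output)[selected_index] += source[s];  // scatter
  }
}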
@@ -822,7 +813,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
// Dot operation is complicated so we delegate to a helper class.
return DotOpEmitter::EmitDotOperation(
*dot, target_array, lhs_array, rhs_array, /*addend_array=*/nullptr,
- GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_,
+ GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
target_machine_features_);
}
@@ -849,12 +840,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
llvm::Type* lhs_llvm_type =
llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_);
llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry(
- lhs_llvm_type, "convolution_sum_address", &ir_builder_,
+ lhs_llvm_type, "convolution_sum_address", &b_,
MinimumAlignmentForPrimitiveType(lhs_element_type));
llvm::Value* constant_zero = llvm::Constant::getNullValue(lhs_llvm_type);
- ir_builder_.CreateStore(constant_zero, sum_address);
+ b_.CreateStore(constant_zero, sum_address);
- llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &ir_builder_);
+ llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &b_);
std::vector<llvm::Value*> kernel_spatial(num_spatial_dims);
for (int i = 0; i < num_spatial_dims; ++i) {
kernel_spatial[i] =
@@ -870,7 +861,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
"iz")
->GetIndVarValue();
- SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
// Calculate the spatial index in the input array, taking striding, dilation
// and padding into account. An index in the padding will be out of the bounds
@@ -878,13 +869,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
const auto calculate_input_index = [this](llvm::Value* output_index,
llvm::Value* kernel_index,
const WindowDimension& window_dim) {
- llvm::Value* strided_index = ir_builder_.CreateNSWMul(
- output_index, ir_builder_.getInt64(window_dim.stride()));
- llvm::Value* dilated_kernel_index = ir_builder_.CreateNSWMul(
- kernel_index, ir_builder_.getInt64(window_dim.window_dilation()));
- return ir_builder_.CreateNSWSub(
- ir_builder_.CreateNSWAdd(strided_index, dilated_kernel_index),
- ir_builder_.getInt64(window_dim.padding_low()));
+ llvm::Value* strided_index =
+ b_.CreateNSWMul(output_index, b_.getInt64(window_dim.stride()));
+ llvm::Value* dilated_kernel_index = b_.CreateNSWMul(
+ kernel_index, b_.getInt64(window_dim.window_dilation()));
+ return b_.CreateNSWSub(b_.CreateNSWAdd(strided_index, dilated_kernel_index),
+ b_.getInt64(window_dim.padding_low()));
};
std::vector<llvm::Value*> input_spatial(num_spatial_dims);
for (int i = 0; i < num_spatial_dims; ++i) {
@@ -901,30 +891,27 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
// Also need to check that the input coordinates are not in one of the
// holes created by base dilation.
const auto not_in_hole = [&](llvm::Value* input_index, int64 base_dilation) {
- llvm::Value* remainder = ir_builder_.CreateSRem(
- input_index, ir_builder_.getInt64(base_dilation));
- return ir_builder_.CreateICmpEQ(remainder, ir_builder_.getInt64(0));
+ llvm::Value* remainder =
+ b_.CreateSRem(input_index, b_.getInt64(base_dilation));
+ return b_.CreateICmpEQ(remainder, b_.getInt64(0));
};
- llvm::Value* in_bounds_condition = ir_builder_.getInt1(true);
+ llvm::Value* in_bounds_condition = b_.getInt1(true);
for (int i = 0; i < num_spatial_dims; ++i) {
- llvm::ConstantInt* input_bound =
- ir_builder_.getInt64(window_util::DilatedBound(
- lhs->shape().dimensions(dnums.input_spatial_dimensions(i)),
- window.dimensions(i).base_dilation()));
- llvm::Value* dim_in_bound =
- ir_builder_.CreateICmpULT(input_spatial[i], input_bound);
+ llvm::ConstantInt* input_bound = b_.getInt64(window_util::DilatedBound(
+ lhs->shape().dimensions(dnums.input_spatial_dimensions(i)),
+ window.dimensions(i).base_dilation()));
+ llvm::Value* dim_in_bound = b_.CreateICmpULT(input_spatial[i], input_bound);
llvm::Value* dim_not_in_hole =
not_in_hole(input_spatial[i], window.dimensions(i).base_dilation());
- llvm::Value* dim_ok = ir_builder_.CreateAnd(dim_in_bound, dim_not_in_hole);
- in_bounds_condition = ir_builder_.CreateAnd(in_bounds_condition, dim_ok);
+ llvm::Value* dim_ok = b_.CreateAnd(dim_in_bound, dim_not_in_hole);
+ in_bounds_condition = b_.CreateAnd(in_bounds_condition, dim_ok);
}
// Now we need to map the dilated base coordinates back to the actual
// data indices on the lhs.
const auto undilate = [&](llvm::Value* input_index, int64 base_dilation) {
- return ir_builder_.CreateSDiv(input_index,
- ir_builder_.getInt64(base_dilation));
+ return b_.CreateSDiv(input_index, b_.getInt64(base_dilation));
};
for (int i = 0; i < num_spatial_dims; ++i) {
input_spatial[i] =
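
Note: with base dilation d, only every d-th position of the dilated input holds real data. A coordinate p is valid iff p mod d == 0 (the CreateSRem test above), maps back to data index p / d (the undilate step), and must lie below the dilated bound, conventionally (n - 1) * d + 1 for an original size n (the window_util::DilatedBound formula, stated here as an assumption rather than taken from this patch).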
@@ -932,12 +919,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
}
llvm_ir::LlvmIfData if_data =
- llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_);
- SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+ llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_);
+ SetToFirstInsertPoint(if_data.true_block, &b_);
// We are not in the padding, so carry out the computation.
int num_dims = num_spatial_dims + 2;
- llvm_ir::IrArray::Index input_index(ir_builder_.getInt64Ty(), num_dims);
+ llvm_ir::IrArray::Index input_index(b_.getInt64Ty(), num_dims);
for (int i = 0; i < num_spatial_dims; ++i) {
input_index[dnums.input_spatial_dimensions(i)] = input_spatial[i];
}
@@ -945,13 +932,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
input_index[dnums.input_batch_dimension()] = batch;
llvm_ir::IrArray kernel_array(GetIrArrayFor(rhs));
- llvm_ir::IrArray::Index kernel_index(ir_builder_.getInt64Ty(), num_dims);
+ llvm_ir::IrArray::Index kernel_index(b_.getInt64Ty(), num_dims);
for (int i = 0; i < num_spatial_dims; ++i) {
kernel_index[dnums.kernel_spatial_dimensions(i)] =
window.dimensions(i).window_reversal()
- ? ir_builder_.CreateNSWSub(
- ir_builder_.getInt64(window.dimensions(i).size() - 1),
- kernel_spatial[i])
+ ? b_.CreateNSWSub(b_.getInt64(window.dimensions(i).size() - 1),
+ kernel_spatial[i])
: kernel_spatial[i];
}
@@ -959,15 +945,14 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
kernel_index[dnums.kernel_output_feature_dimension()] = output_feature;
llvm_ir::IrArray input_array(GetIrArrayFor(lhs));
- llvm::Value* product = ir_builder_.CreateFMul(
- input_array.EmitReadArrayElement(input_index, &ir_builder_),
- kernel_array.EmitReadArrayElement(kernel_index, &ir_builder_));
- llvm::Value* sum =
- ir_builder_.CreateFAdd(ir_builder_.CreateLoad(sum_address), product);
- ir_builder_.CreateStore(sum, sum_address);
+ llvm::Value* product =
+ b_.CreateFMul(input_array.EmitReadArrayElement(input_index, &b_),
+ kernel_array.EmitReadArrayElement(kernel_index, &b_));
+ llvm::Value* sum = b_.CreateFAdd(b_.CreateLoad(sum_address), product);
+ b_.CreateStore(sum, sum_address);
- SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
- return ir_builder_.CreateLoad(sum_address);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
+ return b_.CreateLoad(sum_address);
}
Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
@@ -1056,12 +1041,12 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
PrimitiveType primitive_type = lhs->shape().element_type();
llvm::Type* ir_ptr_type = primitive_type == F16
- ? ir_builder_.getHalfTy()->getPointerTo()
- : ir_builder_.getFloatTy()->getPointerTo();
- llvm::Type* int64_type = ir_builder_.getInt64Ty();
- llvm::Type* int8_ptr_type = ir_builder_.getInt8Ty()->getPointerTo();
+ ? b_.getHalfTy()->getPointerTo()
+ : b_.getFloatTy()->getPointerTo();
+ llvm::Type* int64_type = b_.getInt64Ty();
+ llvm::Type* int8_ptr_type = b_.getInt8Ty()->getPointerTo();
llvm::FunctionType* conv_type = llvm::FunctionType::get(
- ir_builder_.getVoidTy(),
+ b_.getVoidTy(),
{int8_ptr_type, ir_ptr_type, ir_ptr_type, ir_ptr_type, int64_type,
int64_type, int64_type, int64_type, int64_type, int64_type,
int64_type, int64_type, int64_type, int64_type, int64_type,
@@ -1093,34 +1078,34 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
conv_func->setCallingConv(llvm::CallingConv::C);
conv_func->setDoesNotThrow();
conv_func->setOnlyAccessesArgMemory();
- ir_builder_.CreateCall(
- conv_func, {
- GetExecutableRunOptionsArgument(),
- ir_builder_.CreateBitCast(
- GetEmittedValueFor(convolution), ir_ptr_type),
- ir_builder_.CreateBitCast(lhs_address, ir_ptr_type),
- ir_builder_.CreateBitCast(rhs_address, ir_ptr_type),
- ir_builder_.getInt64(input_batch),
- ir_builder_.getInt64(input_rows),
- ir_builder_.getInt64(input_cols),
- ir_builder_.getInt64(input_channels),
- ir_builder_.getInt64(kernel_rows),
- ir_builder_.getInt64(kernel_cols),
- ir_builder_.getInt64(kernel_channels),
- ir_builder_.getInt64(kernel_filters),
- ir_builder_.getInt64(output_rows),
- ir_builder_.getInt64(output_cols),
- ir_builder_.getInt64(row_stride),
- ir_builder_.getInt64(col_stride),
- ir_builder_.getInt64(padding_top),
- ir_builder_.getInt64(padding_bottom),
- ir_builder_.getInt64(padding_left),
- ir_builder_.getInt64(padding_right),
- ir_builder_.getInt64(lhs_row_dilation),
- ir_builder_.getInt64(lhs_col_dilation),
- ir_builder_.getInt64(rhs_row_dilation),
- ir_builder_.getInt64(rhs_col_dilation),
- });
+ b_.CreateCall(
+ conv_func,
+ {
+ GetExecutableRunOptionsArgument(),
+ b_.CreateBitCast(GetEmittedValueFor(convolution), ir_ptr_type),
+ b_.CreateBitCast(lhs_address, ir_ptr_type),
+ b_.CreateBitCast(rhs_address, ir_ptr_type),
+ b_.getInt64(input_batch),
+ b_.getInt64(input_rows),
+ b_.getInt64(input_cols),
+ b_.getInt64(input_channels),
+ b_.getInt64(kernel_rows),
+ b_.getInt64(kernel_cols),
+ b_.getInt64(kernel_channels),
+ b_.getInt64(kernel_filters),
+ b_.getInt64(output_rows),
+ b_.getInt64(output_cols),
+ b_.getInt64(row_stride),
+ b_.getInt64(col_stride),
+ b_.getInt64(padding_top),
+ b_.getInt64(padding_bottom),
+ b_.getInt64(padding_left),
+ b_.getInt64(padding_right),
+ b_.getInt64(lhs_row_dilation),
+ b_.getInt64(lhs_col_dilation),
+ b_.getInt64(rhs_row_dilation),
+ b_.getInt64(rhs_col_dilation),
+ });
return Status::OK();
}
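
HandleConvolution lowers to a call into a prebuilt runtime routine rather than emitting the loop nest inline, so the builder only assembles an argument list of bit-cast buffer pointers and i64 scalars. A hedged sketch of the declare-then-call pattern, reduced to two arguments; the runtime symbol name here is hypothetical:

    #include <cstdint>
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"

    // Declares (or reuses) an external C routine and emits a call to it.
    void EmitRuntimeCall(llvm::IRBuilder<>* b, llvm::Module* module,
                         llvm::Value* out_i8ptr, int64_t n) {
      llvm::FunctionType* fn_type = llvm::FunctionType::get(
          b->getVoidTy(), {b->getInt8PtrTy(), b->getInt64Ty()},
          /*isVarArg=*/false);
      auto* fn = llvm::cast<llvm::Function>(
          module->getOrInsertFunction("__xla_example_runtime_fn", fn_type));
      fn->setCallingConv(llvm::CallingConv::C);
      fn->setDoesNotThrow();
      b->CreateCall(fn, {out_i8ptr, b->getInt64(n)});
    }
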
@@ -1159,11 +1144,11 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
}
// Args have been computed, make the call.
- llvm::Type* int8_ptr_type = ir_builder_.getInt8Ty()->getPointerTo();
- llvm::Type* int32_type = ir_builder_.getInt32Ty();
- llvm::Type* int64_type = ir_builder_.getInt64Ty();
+ llvm::Type* int8_ptr_type = b_.getInt8Ty()->getPointerTo();
+ llvm::Type* int32_type = b_.getInt32Ty();
+ llvm::Type* int64_type = b_.getInt64Ty();
llvm::FunctionType* fft_type = llvm::FunctionType::get(
- ir_builder_.getVoidTy(),
+ b_.getVoidTy(),
{int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type,
int64_type, int64_type, int64_type, int64_type},
/*isVarArg=*/false);
@@ -1180,16 +1165,15 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
fft_func->setDoesNotThrow();
fft_func->setOnlyAccessesInaccessibleMemOrArgMem();
const int fft_rank = fft_length.size();
- ir_builder_.CreateCall(
+ b_.CreateCall(
fft_func,
{GetExecutableRunOptionsArgument(),
- ir_builder_.CreateBitCast(GetEmittedValueFor(fft), int8_ptr_type),
- ir_builder_.CreateBitCast(operand_address, int8_ptr_type),
- ir_builder_.getInt32(fft->fft_type()), ir_builder_.getInt32(fft_rank),
- ir_builder_.getInt64(input_batch),
- ir_builder_.getInt64(fft_rank > 0 ? fft_length[0] : 0),
- ir_builder_.getInt64(fft_rank > 1 ? fft_length[1] : 0),
- ir_builder_.getInt64(fft_rank > 2 ? fft_length[2] : 0)});
+ b_.CreateBitCast(GetEmittedValueFor(fft), int8_ptr_type),
+ b_.CreateBitCast(operand_address, int8_ptr_type),
+ b_.getInt32(fft->fft_type()), b_.getInt32(fft_rank),
+ b_.getInt64(input_batch), b_.getInt64(fft_rank > 0 ? fft_length[0] : 0),
+ b_.getInt64(fft_rank > 1 ? fft_length[1] : 0),
+ b_.getInt64(fft_rank > 2 ? fft_length[2] : 0)});
return Status::OK();
}
@@ -1228,11 +1212,10 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
operand_ptrs.push_back(EmitTempBufferPointer(out_slice, operand_shape));
// TODO(b/63762267): Be more aggressive about specifying alignment.
- ir_builder_.CreateMemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr,
- /*SrcAlign=*/1,
- ShapeUtil::ByteSizeOf(operand_shape));
+ b_.CreateMemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr,
+ /*SrcAlign=*/1, ShapeUtil::ByteSizeOf(operand_shape));
}
- llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &ir_builder_, module_);
+ llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &b_, module_);
return Status::OK();
}
@@ -1278,9 +1261,8 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
// example, float for an XLA F32 element type).
llvm::Value* params = compute_function_->parameters_arg();
llvm::Value* param_address_offset =
- llvm_ir::EmitBufferIndexingGEP(params, param_number, &ir_builder_);
- llvm::LoadInst* param_address_untyped =
- ir_builder_.CreateLoad(param_address_offset);
+ llvm_ir::EmitBufferIndexingGEP(params, param_number, &b_);
+ llvm::LoadInst* param_address_untyped = b_.CreateLoad(param_address_offset);
param_address_untyped->setName(AsStringRef(IrName(parameter, "untyped")));
if (is_top_level_computation_ &&
hlo_module_config_.debug_options()
@@ -1295,7 +1277,7 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
llvm::MDNode::get(param_address_untyped->getContext(), /*MDs=*/{}));
}
- llvm::Value* param_address_typed = ir_builder_.CreateBitCast(
+ llvm::Value* param_address_typed = b_.CreateBitCast(
param_address_untyped, IrShapeType(param_shape)->getPointerTo());
emitted_value_[parameter] = param_address_typed;
@@ -1403,62 +1385,61 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
return nullptr;
case HloOpcode::kAdd:
- return [root_is_integral](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+ return [root_is_integral](llvm::IRBuilder<>* b, llvm::Value* lhs,
llvm::Value* rhs) {
- return root_is_integral ? ir_builder->CreateAdd(lhs, rhs)
- : ir_builder->CreateFAdd(lhs, rhs);
+ return root_is_integral ? b->CreateAdd(lhs, rhs)
+ : b->CreateFAdd(lhs, rhs);
};
case HloOpcode::kMultiply:
- return [root_is_integral](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+ return [root_is_integral](llvm::IRBuilder<>* b, llvm::Value* lhs,
llvm::Value* rhs) {
- return root_is_integral ? ir_builder->CreateMul(lhs, rhs)
- : ir_builder->CreateFMul(lhs, rhs);
+ return root_is_integral ? b->CreateMul(lhs, rhs)
+ : b->CreateFMul(lhs, rhs);
};
case HloOpcode::kAnd:
- return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
- llvm::Value* rhs) { return ir_builder->CreateAnd(lhs, rhs); };
+ return [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+ return b->CreateAnd(lhs, rhs);
+ };
case HloOpcode::kOr:
- return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
- llvm::Value* rhs) { return ir_builder->CreateOr(lhs, rhs); };
+ return [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+ return b->CreateOr(lhs, rhs);
+ };
case HloOpcode::kXor:
- return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
- llvm::Value* rhs) { return ir_builder->CreateXor(lhs, rhs); };
+ return [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+ return b->CreateXor(lhs, rhs);
+ };
case HloOpcode::kMaximum:
return [root_is_floating_point, root_is_signed](
- llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
- llvm::Value* rhs) {
+ llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
if (root_is_floating_point) {
return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::maxnum,
- {lhs, rhs}, {lhs->getType()},
- ir_builder);
+ {lhs, rhs}, {lhs->getType()}, b);
}
- return ir_builder->CreateSelect(
- ir_builder->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SGE
- : llvm::ICmpInst::ICMP_UGE,
- lhs, rhs),
+ return b->CreateSelect(
+ b->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SGE
+ : llvm::ICmpInst::ICMP_UGE,
+ lhs, rhs),
lhs, rhs);
};
case HloOpcode::kMinimum:
return [root_is_floating_point, root_is_signed](
- llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
- llvm::Value* rhs) {
+ llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
if (root_is_floating_point) {
return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::minnum,
- {lhs, rhs}, {lhs->getType()},
- ir_builder);
+ {lhs, rhs}, {lhs->getType()}, b);
}
- return ir_builder->CreateSelect(
- ir_builder->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SLE
- : llvm::ICmpInst::ICMP_ULE,
- lhs, rhs),
+ return b->CreateSelect(
+ b->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SLE
+ : llvm::ICmpInst::ICMP_ULE,
+ lhs, rhs),
lhs, rhs);
};
}
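
Each case above returns a small stateless generator so the reduction loop emitters stay agnostic to the reduced operation. As a usage sketch (assumed; the call sites live elsewhere in this file), a generator is simply invoked with the builder and two SSA values:

    // Illustrative only: the kAdd generator for floating-point roots.
    auto gen = [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
      return b->CreateFAdd(lhs, rhs);
    };
    // Inside a reduction loop body, with `acc` and `elem` already loaded:
    //   acc = gen(&b, acc, elem);
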
@@ -1527,34 +1508,31 @@ IrEmitter::EmitInnerLoopForVectorizedReduction(
accumulator.reserve(accumulator_type.size());
for (auto accumulator_shard_type : accumulator_type) {
accumulator.push_back(llvm_ir::EmitAllocaAtFunctionEntry(
- accumulator_shard_type, "accumulator", &ir_builder_, 0));
+ accumulator_shard_type, "accumulator", &b_, 0));
}
- llvm::Value* init_value_ssa =
- ir_builder_.CreateLoad(GetEmittedValueFor(init_value));
+ llvm::Value* init_value_ssa = b_.CreateLoad(GetEmittedValueFor(init_value));
for (llvm::Value* accumulator_shard : accumulator) {
llvm::Value* initial_value;
auto shard_type = accumulator_shard->getType()->getPointerElementType();
if (auto vector_type = llvm::dyn_cast<llvm::VectorType>(shard_type)) {
- initial_value = ir_builder_.CreateVectorSplat(
- vector_type->getNumElements(), init_value_ssa);
+ initial_value =
+ b_.CreateVectorSplat(vector_type->getNumElements(), init_value_ssa);
} else {
initial_value = init_value_ssa;
}
- ir_builder_.CreateAlignedStore(initial_value, accumulator_shard,
- element_alignment);
+ b_.CreateAlignedStore(initial_value, accumulator_shard, element_alignment);
}
llvm_ir::ForLoopNest reduction_loop_nest(IrName(arg, "vectorized_inner"),
- &ir_builder_);
+ &b_);
llvm_ir::IrArray::Index reduced_dims_index =
reduction_loop_nest.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
"reduction_dim");
- SetToFirstInsertPoint(reduction_loop_nest.GetInnerLoopBodyBasicBlock(),
- &ir_builder_);
+ SetToFirstInsertPoint(reduction_loop_nest.GetInnerLoopBodyBasicBlock(), &b_);
llvm_ir::IrArray arg_array(GetIrArrayFor(arg));
llvm_ir::IrArray::Index input_index = reduced_dims_index;
@@ -1567,38 +1545,34 @@ IrEmitter::EmitInnerLoopForVectorizedReduction(
}
CHECK(output_index.end() == it);
- llvm::Value* input_address = ir_builder_.CreateBitCast(
- arg_array.EmitArrayElementAddress(input_index, &ir_builder_),
- ir_builder_.getInt8PtrTy());
+ llvm::Value* input_address = b_.CreateBitCast(
+ arg_array.EmitArrayElementAddress(input_index, &b_), b_.getInt8PtrTy());
for (int i = 0; i < accumulator.size(); i++) {
auto input_address_typed =
- ir_builder_.CreateBitCast(input_address, accumulator[i]->getType());
+ b_.CreateBitCast(input_address, accumulator[i]->getType());
auto current_accumulator_value =
- ir_builder_.CreateAlignedLoad(accumulator[i], element_alignment);
- auto addend =
- ir_builder_.CreateAlignedLoad(input_address_typed, element_alignment);
+ b_.CreateAlignedLoad(accumulator[i], element_alignment);
+ auto addend = b_.CreateAlignedLoad(input_address_typed, element_alignment);
arg_array.AnnotateLoadStoreInstructionWithMetadata(addend);
auto reduced_result =
- reduction_generator(&ir_builder_, current_accumulator_value, addend);
- ir_builder_.CreateAlignedStore(reduced_result, accumulator[i],
- element_alignment);
+ reduction_generator(&b_, current_accumulator_value, addend);
+ b_.CreateAlignedStore(reduced_result, accumulator[i], element_alignment);
if (i != (accumulator.size() - 1)) {
- input_address = ir_builder_.CreateConstInBoundsGEP1_32(
- reduced_result->getType(), input_address_typed, 1);
+ input_address = b_.CreateConstInBoundsGEP1_32(reduced_result->getType(),
+ input_address_typed, 1);
}
}
- SetToFirstInsertPoint(reduction_loop_nest.GetOuterLoopExitBasicBlock(),
- &ir_builder_);
+ SetToFirstInsertPoint(reduction_loop_nest.GetOuterLoopExitBasicBlock(), &b_);
ShardedVector result_ssa;
result_ssa.reserve(accumulator.size());
for (auto accumulator_shard : accumulator) {
result_ssa.push_back(
- ir_builder_.CreateAlignedLoad(accumulator_shard, element_alignment));
+ b_.CreateAlignedLoad(accumulator_shard, element_alignment));
}
return result_ssa;
}
@@ -1607,17 +1581,17 @@ void IrEmitter::EmitShardedVectorStore(
llvm::Value* store_address, const std::vector<llvm::Value*>& value_to_store,
const int alignment, const llvm_ir::IrArray& containing_array) {
for (int i = 0; i < value_to_store.size(); i++) {
- auto store_address_typed = ir_builder_.CreateBitCast(
+ auto store_address_typed = b_.CreateBitCast(
store_address,
llvm::PointerType::getUnqual(value_to_store[i]->getType()));
- auto store_instruction = ir_builder_.CreateAlignedStore(
+ auto store_instruction = b_.CreateAlignedStore(
value_to_store[i], store_address_typed, alignment);
containing_array.AnnotateLoadStoreInstructionWithMetadata(
store_instruction);
if (i != (value_to_store.size() - 1)) {
- store_address = ir_builder_.CreateConstInBoundsGEP1_32(
+ store_address = b_.CreateConstInBoundsGEP1_32(
value_to_store[i]->getType(), store_address_typed, 1);
}
}
@@ -1683,8 +1657,8 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
// }
// }
- llvm_ir::ForLoopNest loop_nest(IrName(reduce), &ir_builder_);
- llvm_ir::IrArray::Index array_index(ir_builder_.getInt64Ty(),
+ llvm_ir::ForLoopNest loop_nest(IrName(reduce), &b_);
+ llvm_ir::IrArray::Index array_index(b_.getInt64Ty(),
reduce->shape().dimensions_size());
for (int i = LayoutUtil::MinorToMajor(reduce->shape()).size() - 1; i > 0;
--i) {
@@ -1703,7 +1677,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
if (llvm::BasicBlock* innermost_body_bb =
loop_nest.GetInnerLoopBodyBasicBlock()) {
- SetToFirstInsertPoint(innermost_body_bb, &ir_builder_);
+ SetToFirstInsertPoint(innermost_body_bb, &b_);
}
auto outermost_loop_exit_block = loop_nest.GetOuterLoopExitBasicBlock();
@@ -1717,7 +1691,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
tensorflow::strings::Printf("dim.%lld", innermost_dimension));
array_index[innermost_dimension] = loop->GetIndVarValue();
- SetToFirstInsertPoint(loop->GetBodyBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loop->GetBodyBasicBlock(), &b_);
ShardedVectorType vector_type = CreateShardedVectorType(
reduce->shape().element_type(), vectorization_factor);
@@ -1728,16 +1702,16 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
llvm_ir::IrArray target_array = GetIrArrayFor(reduce);
llvm::Value* output_address =
- target_array.EmitArrayElementAddress(array_index, &ir_builder_);
+ target_array.EmitArrayElementAddress(array_index, &b_);
EmitShardedVectorStore(output_address, accumulator, element_alignment,
target_array);
if (auto exit_terminator = loop->GetExitBasicBlock()->getTerminator()) {
CHECK_GT(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1);
- ir_builder_.SetInsertPoint(exit_terminator);
+ b_.SetInsertPoint(exit_terminator);
} else {
CHECK_EQ(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1);
- ir_builder_.SetInsertPoint(loop->GetExitBasicBlock());
+ b_.SetInsertPoint(loop->GetExitBasicBlock());
}
}
@@ -1747,8 +1721,8 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
if (innermost_dimension_size % vectorization_factor) {
// TODO(b/63775531): Consider using a scalar loop here to save on code size.
array_index[innermost_dimension] =
- ir_builder_.getInt64(innermost_dimension_size -
- (innermost_dimension_size % vectorization_factor));
+ b_.getInt64(innermost_dimension_size -
+ (innermost_dimension_size % vectorization_factor));
ShardedVectorType vector_type = CreateShardedVectorType(
reduce->shape().element_type(),
@@ -1760,13 +1734,13 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
llvm_ir::IrArray target_array = GetIrArrayFor(reduce);
llvm::Value* output_address =
- target_array.EmitArrayElementAddress(array_index, &ir_builder_);
+ target_array.EmitArrayElementAddress(array_index, &b_);
EmitShardedVectorStore(output_address, accumulator, element_alignment,
target_array);
}
if (outermost_loop_exit_block) {
- ir_builder_.SetInsertPoint(outermost_loop_exit_block);
+ b_.SetInsertPoint(outermost_loop_exit_block);
}
return true;
@@ -1785,22 +1759,22 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
PrimitiveType accumulator_type = reduce->shape().element_type();
llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry(
llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_), "accumulator",
- &ir_builder_, MinimumAlignmentForPrimitiveType(accumulator_type));
+ &b_, MinimumAlignmentForPrimitiveType(accumulator_type));
llvm::Value* init_value_addr = GetEmittedValueFor(init_value);
- llvm::Value* load_init_value = ir_builder_.CreateLoad(init_value_addr);
- ir_builder_.CreateStore(load_init_value, accumulator_addr);
+ llvm::Value* load_init_value = b_.CreateLoad(init_value_addr);
+ b_.CreateStore(load_init_value, accumulator_addr);
// The enclosing loops go over all the target elements. Now we have to compute
// the actual target element. For this, we build a new loop nest to iterate
// over all the reduction dimensions in the argument.
// AddLoopsForShapeOnDimensions will return an Index where induction Value*s
// are placed for each dimension in dimensions, and all the rest are nullptrs.
- llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &ir_builder_);
+ llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_);
const llvm_ir::IrArray::Index reduced_dims_index =
loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
"reduction_dim");
- SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
// Build a full index for the input argument, using reduced_dims_index as the
// base. In reduced_dims_index only the reduction dimensions are filled in. We
@@ -1820,14 +1794,14 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
// Apply the reduction function to the loaded value.
llvm::Value* input_address =
- arg_array.EmitArrayElementAddress(input_index, &ir_builder_);
+ arg_array.EmitArrayElementAddress(input_index, &b_);
llvm::Value* result = EmitElementFunctionCall(
reducer_function, reduce->shape(), {accumulator_addr, input_address},
"reduce_function");
- ir_builder_.CreateStore(result, accumulator_addr);
+ b_.CreateStore(result, accumulator_addr);
- SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
- return ir_builder_.CreateLoad(accumulator_addr);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
+ return b_.CreateLoad(accumulator_addr);
}
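
To make the index plumbing concrete, a worked example with assumed shapes:

    // Reducing f32[5,6,7] over dimensions = {1}: the enclosing target loops
    // supply the output index (i, k); AddLoopsForShapeOnDimensions yields
    // reduced_dims_index = (nullptr, r, nullptr), r being the inner induction
    // variable; splicing the two gives the full input index (i, r, k).
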
Status IrEmitter::HandleReduce(HloInstruction* reduce) {
@@ -1957,7 +1931,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
llvm_ir::IrArray target_array = GetIrArrayFor(slice);
const int64 num_outer_loops = outer_dims.size();
- llvm_ir::ForLoopNest loops(IrName(slice), &ir_builder_);
+ llvm_ir::ForLoopNest loops(IrName(slice), &b_);
llvm_ir::IrArray::Index target_index =
loops.AddLoopsForShapeOnDimensions(slice->shape(), outer_dims, "slice");
@@ -1966,21 +1940,21 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
// for the rest of the dimensions the copy writes to the full dimension.
std::replace(target_index.begin(), target_index.end(),
static_cast<llvm::Value*>(nullptr),
- static_cast<llvm::Value*>(ir_builder_.getInt64(0)));
+ static_cast<llvm::Value*>(b_.getInt64(0)));
if (num_outer_loops > 0) {
- SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
}
llvm_ir::IrArray source_array = GetIrArrayFor(operand);
const llvm_ir::IrArray::Index source_index = target_index.SourceIndexOfSlice(
/*shape=*/slice->shape(), /*starts=*/slice->slice_starts(),
- /*strides=*/slice->slice_strides(), /*builder=*/&ir_builder_);
+ /*strides=*/slice->slice_strides(), /*builder=*/&b_);
- llvm::Value* memcpy_dest = target_array.EmitArrayElementAddress(
- target_index, &ir_builder_, "slice.dest");
- llvm::Value* memcpy_source = source_array.EmitArrayElementAddress(
- source_index, &ir_builder_, "slice.source");
+ llvm::Value* memcpy_dest =
+ target_array.EmitArrayElementAddress(target_index, &b_, "slice.dest");
+ llvm::Value* memcpy_source =
+ source_array.EmitArrayElementAddress(source_index, &b_, "slice.source");
const int64 memcpy_elements =
primitive_elements_per_logical_element * memcpy_logical_elements;
@@ -1997,7 +1971,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
}
if (num_outer_loops > 0) {
- SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
}
return Status::OK();
@@ -2023,7 +1997,7 @@ Status IrEmitter::HandleDynamicUpdateSlice(
auto operands = GetIrArraysForOperandsOf(dynamic_update_slice);
return llvm_ir::EmitDynamicUpdateSliceInPlace(
operands, GetIrArrayFor(dynamic_update_slice),
- IrName(dynamic_update_slice, "in_place"), &ir_builder_);
+ IrName(dynamic_update_slice, "in_place"), &b_);
}
return DefaultAction(dynamic_update_slice);
}
@@ -2057,43 +2031,41 @@ Status IrEmitter::HandlePad(HloInstruction* pad) {
[this, pad](const llvm_ir::IrArray::Index& target_index) {
const HloInstruction* padding_value = pad->operand(1);
llvm::Value* padding_value_addr = GetEmittedValueFor(padding_value);
- return ir_builder_.CreateLoad(padding_value_addr);
+ return b_.CreateLoad(padding_value_addr);
}));
// Create a loop to iterate over the operand elements and update the output
// locations where the operand elements should be stored.
- llvm_ir::ForLoopNest loops(IrName(pad, "assign"), &ir_builder_);
+ llvm_ir::ForLoopNest loops(IrName(pad, "assign"), &b_);
const HloInstruction* operand = pad->operand(0);
const llvm_ir::IrArray::Index operand_index =
loops.AddLoopsForShape(operand->shape(), "operand");
- SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
// Load an element from the operand.
llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
llvm::Value* operand_data =
- operand_array.EmitReadArrayElement(operand_index, &ir_builder_);
+ operand_array.EmitReadArrayElement(operand_index, &b_);
// Compute the output index the operand element should be assigned to.
// output_index := edge_padding_low + operand_index * (interior_padding + 1)
const PaddingConfig& padding_config = pad->padding_config();
llvm_ir::IrArray::Index output_index(operand_index.GetType());
for (size_t i = 0; i < operand_index.size(); ++i) {
- llvm::Value* offset = ir_builder_.CreateMul(
+ llvm::Value* offset = b_.CreateMul(
operand_index[i],
- ir_builder_.getInt64(padding_config.dimensions(i).interior_padding() +
- 1));
- llvm::Value* index = ir_builder_.CreateAdd(
- offset,
- ir_builder_.getInt64(padding_config.dimensions(i).edge_padding_low()));
+ b_.getInt64(padding_config.dimensions(i).interior_padding() + 1));
+ llvm::Value* index = b_.CreateAdd(
+ offset, b_.getInt64(padding_config.dimensions(i).edge_padding_low()));
output_index.push_back(index);
}
// Store the operand element to the computed output location.
llvm_ir::IrArray output_array(GetIrArrayFor(pad));
- output_array.EmitWriteArrayElement(output_index, operand_data, &ir_builder_);
+ output_array.EmitWriteArrayElement(output_index, operand_data, &b_);
- SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
return Status::OK();
}
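
A quick numeric check of the output-index formula above, with values assumed for illustration:

    // With edge_padding_low = 1 and interior_padding = 2 in one dimension,
    // operand element 3 maps to output index 1 + 3 * (2 + 1) = 10.
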
@@ -2115,8 +2087,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
// Delegate to common implementation of fused in-place dynamic-update-slice.
auto operands = GetIrArraysForOperandsOf(fusion);
return llvm_ir::EmitFusedDynamicUpdateSliceInPlace(
- fusion, operands, GetIrArrayFor(fusion), &elemental_emitter,
- &ir_builder_);
+ fusion, operands, GetIrArrayFor(fusion), &elemental_emitter, &b_);
} else if (fusion->fusion_kind() == HloInstruction::FusionKind::kLoop) {
VLOG(3) << "HandleFusion kLoop";
CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_);
@@ -2151,7 +2122,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
*dot, target_array, lhs_array, rhs_array, &addend_array,
- GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_,
+ GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
target_machine_features_));
return Status::OK();
} else {
@@ -2174,7 +2145,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
// ParallelTaskAssignment assigned partitions, emit call to
// ParallelForkJoin.
std::vector<llvm::Value*> call_args = GetArrayFunctionCallArguments(
- parameter_addresses, &ir_builder_, computation->name(),
+ parameter_addresses, &b_, computation->name(),
/*return_value_buffer=*/emitted_value_[call],
/*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
/*temp_buffers_arg=*/GetTempBuffersArgument(),
@@ -2182,8 +2153,8 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
HloInstruction* root = computation->root_instruction();
TF_RETURN_IF_ERROR(EmitCallToParallelForkJoin(
- call_args, root->shape(), root->outer_dimension_partitions(),
- &ir_builder_, call_ir_function, computation->name()));
+ call_args, root->shape(), root->outer_dimension_partitions(), &b_,
+ call_ir_function, computation->name()));
} else {
EmitArrayFunctionCallInto(call_ir_function, parameter_addresses,
emitted_value_[call], computation->name());
@@ -2195,33 +2166,31 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
gtl::ArraySlice<HloInstruction*> operands(custom_call->operands());
tensorflow::StringPiece custom_call_target(custom_call->custom_call_target());
- llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy();
+ llvm::Type* i8_ptr_type = b_.getInt8PtrTy();
llvm::AllocaInst* operands_alloca =
llvm_ir::EmitAllocaAtFunctionEntryWithCount(
- i8_ptr_type, ir_builder_.getInt32(operands.size()),
- "cc_operands_alloca", &ir_builder_);
+ i8_ptr_type, b_.getInt32(operands.size()), "cc_operands_alloca", &b_);
for (size_t i = 0; i < operands.size(); ++i) {
const HloInstruction* operand = operands[i];
llvm::Value* operand_as_i8ptr =
- ir_builder_.CreatePointerCast(GetEmittedValueFor(operand), i8_ptr_type);
- llvm::Value* slot_in_operands_alloca = ir_builder_.CreateInBoundsGEP(
- operands_alloca, {ir_builder_.getInt64(i)});
- ir_builder_.CreateStore(operand_as_i8ptr, slot_in_operands_alloca);
+ b_.CreatePointerCast(GetEmittedValueFor(operand), i8_ptr_type);
+ llvm::Value* slot_in_operands_alloca =
+ b_.CreateInBoundsGEP(operands_alloca, {b_.getInt64(i)});
+ b_.CreateStore(operand_as_i8ptr, slot_in_operands_alloca);
}
auto* custom_call_ir_function =
llvm::cast<llvm::Function>(module_->getOrInsertFunction(
AsStringRef(custom_call_target),
llvm::FunctionType::get(
- /*Result=*/ir_builder_.getVoidTy(),
+ /*Result=*/b_.getVoidTy(),
/*Params=*/{i8_ptr_type, operands_alloca->getType()},
/*isVarArg=*/false)));
TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call));
- auto* output_address_arg = ir_builder_.CreatePointerCast(
- GetEmittedValueFor(custom_call), i8_ptr_type);
+ auto* output_address_arg =
+ b_.CreatePointerCast(GetEmittedValueFor(custom_call), i8_ptr_type);
- ir_builder_.CreateCall(custom_call_ir_function,
- {output_address_arg, operands_alloca});
+ b_.CreateCall(custom_call_ir_function, {output_address_arg, operands_alloca});
return Status::OK();
}
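
The emitted call passes the result buffer as i8* and the operands as an array of i8*, so a host-side target plausibly looks like the sketch below; the symbol name and the float element type are assumptions, not part of this change:

    // Hypothetical custom-call target matching the (i8*, i8**) signature.
    extern "C" void MyCustomCall(void* output, void** operands) {
      const float* in0 = static_cast<const float*>(operands[0]);
      float* out = static_cast<float*>(output);
      out[0] = in0[0];  // toy body: forward one element
    }
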
@@ -2286,8 +2255,8 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
llvm::BasicBlock* header_bb = llvm::BasicBlock::Create(
module_->getContext(), AsStringRef(IrName(xla_while, "header")),
compute_function_->function());
- ir_builder_.CreateBr(header_bb);
- ir_builder_.SetInsertPoint(header_bb);
+ b_.CreateBr(header_bb);
+ b_.SetInsertPoint(header_bb);
// Calls the condition function to determine whether to proceed with the
// body. It must return a bool, so use the scalar call form.
@@ -2295,7 +2264,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
llvm::Value* while_condition = EmitElementFunctionCall(
condition_ir_function, condition->root_instruction()->shape(),
{while_result}, IrName(xla_while, "cond"));
- llvm::Value* while_predicate = ir_builder_.CreateICmpNE(
+ llvm::Value* while_predicate = b_.CreateICmpNE(
while_condition,
llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0));
@@ -2305,20 +2274,20 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
compute_function_->function());
llvm::BasicBlock* exit_bb = llvm::BasicBlock::Create(
module_->getContext(), AsStringRef(IrName(xla_while, "exit")));
- ir_builder_.CreateCondBr(while_predicate, body_bb, exit_bb);
+ b_.CreateCondBr(while_predicate, body_bb, exit_bb);
// Calls the body function from the body block.
- ir_builder_.SetInsertPoint(body_bb);
+ b_.SetInsertPoint(body_bb);
// Calls the body function.
EmitArrayFunctionCallInto(body_ir_function, {while_result}, while_result,
IrName(xla_while, "body"));
// Finishes with a branch back to the header.
- ir_builder_.CreateBr(header_bb);
+ b_.CreateBr(header_bb);
// Adds the exit block to the function and sets the insert point there.
compute_function_->function()->getBasicBlockList().push_back(exit_bb);
- ir_builder_.SetInsertPoint(exit_bb);
+ b_.SetInsertPoint(exit_bb);
return Status::OK();
}
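
The control flow emitted by HandleWhile is a classic header/body/exit rotation. A compilable sketch of just the block structure follows; the predicate is shown as a precomputed i8 purely for brevity, whereas the real emitter re-calls the condition computation inside the header on every iteration:

    #include "llvm/IR/IRBuilder.h"

    void EmitWhileSkeleton(llvm::IRBuilder<>* b, llvm::Function* fn,
                           llvm::Value* cond_i8) {
      llvm::LLVMContext& ctx = fn->getContext();
      auto* header = llvm::BasicBlock::Create(ctx, "while.header", fn);
      auto* body = llvm::BasicBlock::Create(ctx, "while.body", fn);
      auto* exit = llvm::BasicBlock::Create(ctx, "while.exit", fn);
      b->CreateBr(header);
      b->SetInsertPoint(header);
      llvm::Value* pred = b->CreateICmpNE(cond_i8, b->getInt8(0));
      b->CreateCondBr(pred, body, exit);
      b->SetInsertPoint(body);
      // ... the call into the loop-body computation goes here ...
      b->CreateBr(header);  // back edge: re-test the condition
      b->SetInsertPoint(exit);
    }
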
@@ -2360,21 +2329,21 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
std::vector<int64> outer_dims(std::next(concat_dim_layout_itr),
output_min2maj.end());
- llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy();
- llvm::Type* i8_type = ir_builder_.getInt8Ty();
+ llvm::Type* i8_ptr_type = b_.getInt8PtrTy();
+ llvm::Type* i8_type = b_.getInt8Ty();
TF_RETURN_IF_ERROR(EmitTargetAddressForOp(concatenate));
llvm_ir::IrArray target_array = GetIrArrayFor(concatenate);
- llvm_ir::ForLoopNest loops(IrName(concatenate), &ir_builder_);
+ llvm_ir::ForLoopNest loops(IrName(concatenate), &b_);
llvm_ir::IrArray::Index outer_dims_index =
loops.AddLoopsForShapeOnDimensions(output_shape, outer_dims, "concat");
std::replace(outer_dims_index.begin(), outer_dims_index.end(),
static_cast<llvm::Value*>(nullptr),
- static_cast<llvm::Value*>(ir_builder_.getInt64(0)));
+ static_cast<llvm::Value*>(b_.getInt64(0)));
if (!outer_dims.empty()) {
- SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
}
PrimitiveType primitive_type = output_shape.element_type();
@@ -2383,10 +2352,10 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
// Contiguous subregions from each operand to the concatenate contribute to a
// contiguous subregion in the target buffer starting at target_region_begin.
- llvm::Value* target_region_begin = ir_builder_.CreateBitCast(
- target_array.EmitArrayElementAddress(outer_dims_index, &ir_builder_,
- "target_region"),
- i8_ptr_type);
+ llvm::Value* target_region_begin =
+ b_.CreateBitCast(target_array.EmitArrayElementAddress(
+ outer_dims_index, &b_, "target_region"),
+ i8_ptr_type);
int64 byte_offset_into_target_region = 0;
int64 inner_dims_product =
@@ -2400,14 +2369,13 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
for (HloInstruction* operand : operands) {
const Shape& input_shape = operand->shape();
llvm_ir::IrArray source_array = GetIrArrayFor(operand);
- llvm::Value* copy_source_address = ir_builder_.CreateBitCast(
- source_array.EmitArrayElementAddress(outer_dims_index, &ir_builder_,
- "src_addr"),
+ llvm::Value* copy_source_address = b_.CreateBitCast(
+ source_array.EmitArrayElementAddress(outer_dims_index, &b_, "src_addr"),
i8_ptr_type);
- llvm::Value* copy_target_address = ir_builder_.CreateGEP(
- i8_type, target_region_begin,
- ir_builder_.getInt64(byte_offset_into_target_region));
+ llvm::Value* copy_target_address =
+ b_.CreateGEP(i8_type, target_region_begin,
+ b_.getInt64(byte_offset_into_target_region));
EmitTransferElements(
copy_target_address, copy_source_address,
@@ -2420,7 +2388,7 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
}
if (!outer_dims.empty()) {
- SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
}
return true;
@@ -2439,16 +2407,15 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source,
llvm_ir::PrimitiveTypeToIrType(primitive_type, module_));
if (element_count == 1) {
- auto* load_instruction = ir_builder_.CreateAlignedLoad(
- ir_builder_.CreateBitCast(source, primitive_ptr_type),
- element_alignment);
+ auto* load_instruction = b_.CreateAlignedLoad(
+ b_.CreateBitCast(source, primitive_ptr_type), element_alignment);
source_array.AnnotateLoadStoreInstructionWithMetadata(load_instruction);
- auto* store_instruction = ir_builder_.CreateAlignedStore(
- load_instruction, ir_builder_.CreateBitCast(target, primitive_ptr_type),
+ auto* store_instruction = b_.CreateAlignedStore(
+ load_instruction, b_.CreateBitCast(target, primitive_ptr_type),
element_alignment);
target_array.AnnotateLoadStoreInstructionWithMetadata(store_instruction);
} else {
- auto* memcpy_instruction = ir_builder_.CreateMemCpy(
+ auto* memcpy_instruction = b_.CreateMemCpy(
target, /*DstAlign=*/element_alignment, source,
/*SrcAlign=*/element_alignment, element_count * primitive_type_size);
@@ -2518,24 +2485,24 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) {
// cond_result = true_computation(true_operand)
// else
// cond_result = false_computation(false_operand)
- llvm::LoadInst* pred_value = ir_builder_.CreateLoad(
+ llvm::LoadInst* pred_value = b_.CreateLoad(
GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value");
- llvm::Value* pred_cond = ir_builder_.CreateICmpNE(
+ llvm::Value* pred_cond = b_.CreateICmpNE(
pred_value,
llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
"boolean_predicate");
llvm_ir::LlvmIfData if_data =
- llvm_ir::EmitIfThenElse(pred_cond, "conditional", &ir_builder_);
+ llvm_ir::EmitIfThenElse(pred_cond, "conditional", &b_);
- SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+ SetToFirstInsertPoint(if_data.true_block, &b_);
EmitArrayFunctionCallInto(true_function, {GetEmittedValueFor(true_arg)},
conditional_result, IrName(conditional, "_true"));
- SetToFirstInsertPoint(if_data.false_block, &ir_builder_);
+ SetToFirstInsertPoint(if_data.false_block, &b_);
EmitArrayFunctionCallInto(false_function, {GetEmittedValueFor(false_arg)},
conditional_result, IrName(conditional, "_false"));
- SetToFirstInsertPoint(if_data.after_block, &ir_builder_);
+ SetToFirstInsertPoint(if_data.after_block, &b_);
return Status::OK();
}
@@ -2546,6 +2513,28 @@ Status IrEmitter::HandleAfterAll(HloInstruction* gen_token) {
return Status::OK();
}
+Status IrEmitter::HandleIota(HloInstruction* iota) {
+ // TODO(b/64798317): implement iota on CPU.
+ return Unimplemented("Iota is not implemented on CPU.");
+}
+
+Status IrEmitter::HandleRng(HloInstruction* rng) {
+ ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
+ for (const HloInstruction* operand : rng->operands()) {
+ operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
+ return GetIrArrayFor(operand).EmitReadArrayElement(index, &b_);
+ };
+ }
+
+ CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_);
+ TF_RETURN_IF_ERROR(EmitTargetElementLoop(
+ rng, elemental_emitter.MakeElementGenerator(rng, operand_to_generator)));
+
+ llvm_ir::IncrementVariableForPhiloxRngState(1, module_, &b_);
+
+ return Status::OK();
+}
+
Status IrEmitter::FinishVisit(HloInstruction* root) {
// When this method is called, we should have already emitted an IR value for
// the root (return) op. The IR value holds the address of the buffer holding
@@ -2563,7 +2552,7 @@ Status IrEmitter::FinishVisit(HloInstruction* root) {
auto record_complete_computation = [&](llvm::Value* prof_counter) {
if (prof_counter) {
- profiling_state_.RecordCompleteComputation(&ir_builder_, prof_counter);
+ profiling_state_.RecordCompleteComputation(&b_, prof_counter);
}
};
@@ -2585,54 +2574,51 @@ llvm::Value* IrEmitter::GetProfileCounterCommon(
int64 prof_counter_idx = it->second;
string counter_name = IrName("prof_counter", hlo.name());
- return ir_builder_.CreateGEP(GetProfileCountersArgument(),
- ir_builder_.getInt64(prof_counter_idx),
- AsStringRef(counter_name));
+ return b_.CreateGEP(GetProfileCountersArgument(),
+ b_.getInt64(prof_counter_idx), AsStringRef(counter_name));
}
-void IrEmitter::ProfilingState::UpdateProfileCounter(
- llvm::IRBuilder<>* ir_builder, llvm::Value* prof_counter,
- llvm::Value* cycle_end, llvm::Value* cycle_start) {
- auto* cycle_diff = ir_builder->CreateSub(cycle_end, cycle_start);
+void IrEmitter::ProfilingState::UpdateProfileCounter(llvm::IRBuilder<>* b,
+ llvm::Value* prof_counter,
+ llvm::Value* cycle_end,
+ llvm::Value* cycle_start) {
+ auto* cycle_diff = b->CreateSub(cycle_end, cycle_start);
llvm::LoadInst* old_cycle_count =
- ir_builder->CreateLoad(prof_counter, "old_cycle_count");
+ b->CreateLoad(prof_counter, "old_cycle_count");
auto* new_cycle_count =
- ir_builder->CreateAdd(cycle_diff, old_cycle_count, "new_cycle_count");
- ir_builder->CreateStore(new_cycle_count, prof_counter);
+ b->CreateAdd(cycle_diff, old_cycle_count, "new_cycle_count");
+ b->CreateStore(new_cycle_count, prof_counter);
}
-llvm::Value* IrEmitter::ProfilingState::ReadCycleCounter(
- llvm::IRBuilder<>* ir_builder) {
- llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+llvm::Value* IrEmitter::ProfilingState::ReadCycleCounter(llvm::IRBuilder<>* b) {
+ llvm::Module* module = b->GetInsertBlock()->getModule();
  if (!use_rdtscp_) {
llvm::Function* func_llvm_readcyclecounter =
llvm::Intrinsic::getDeclaration(module,
llvm::Intrinsic::readcyclecounter);
- return ir_builder->CreateCall(func_llvm_readcyclecounter);
+ return b->CreateCall(func_llvm_readcyclecounter);
}
llvm::Function* func_llvm_x86_rdtscp =
llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::x86_rdtscp);
if (!aux_i8ptr_) {
- llvm::AllocaInst* rdtscp_aux = llvm_ir::EmitAllocaAtFunctionEntry(
- ir_builder->getInt32Ty(), "rdtscp_aux", ir_builder);
- aux_i8ptr_ =
- ir_builder->CreateBitCast(rdtscp_aux, ir_builder->getInt8PtrTy());
+ llvm::AllocaInst* rdtscp_aux =
+ llvm_ir::EmitAllocaAtFunctionEntry(b->getInt32Ty(), "rdtscp_aux", b);
+ aux_i8ptr_ = b->CreateBitCast(rdtscp_aux, b->getInt8PtrTy());
}
- llvm::ConstantInt* alloca_size = ir_builder->getInt64(4);
+ llvm::ConstantInt* alloca_size = b->getInt64(4);
llvm::Function* func_llvm_lifetime_start =
llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::lifetime_start);
- ir_builder->CreateCall(func_llvm_lifetime_start, {alloca_size, aux_i8ptr_});
- llvm::Value* rdtscp_call =
- ir_builder->CreateCall(func_llvm_x86_rdtscp, aux_i8ptr_);
+ b->CreateCall(func_llvm_lifetime_start, {alloca_size, aux_i8ptr_});
+ llvm::Value* rdtscp_call = b->CreateCall(func_llvm_x86_rdtscp, aux_i8ptr_);
llvm::Function* func_llvm_lifetime_end =
llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::lifetime_end);
- ir_builder->CreateCall(func_llvm_lifetime_end, {alloca_size, aux_i8ptr_});
+ b->CreateCall(func_llvm_lifetime_end, {alloca_size, aux_i8ptr_});
return rdtscp_call;
}
-void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* ir_builder,
+void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* b,
HloInstruction* hlo) {
- auto* cycle_start = ReadCycleCounter(ir_builder);
+ auto* cycle_start = ReadCycleCounter(b);
cycle_start->setName(AsStringRef(IrName(hlo, "cycle_start")));
cycle_starts_[hlo] = cycle_start;
if (first_read_cycle_start_ == nullptr) {
@@ -2640,20 +2626,20 @@ void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* ir_builder,
}
}
-void IrEmitter::ProfilingState::RecordCycleDelta(llvm::IRBuilder<>* ir_builder,
+void IrEmitter::ProfilingState::RecordCycleDelta(llvm::IRBuilder<>* b,
HloInstruction* hlo,
llvm::Value* prof_counter) {
- auto* cycle_end = ReadCycleCounter(ir_builder);
+ auto* cycle_end = ReadCycleCounter(b);
cycle_end->setName(AsStringRef(IrName(hlo, "cycle_end")));
auto* cycle_start = cycle_starts_[hlo];
- UpdateProfileCounter(ir_builder, prof_counter, cycle_end, cycle_start);
+ UpdateProfileCounter(b, prof_counter, cycle_end, cycle_start);
last_read_cycle_end_ = cycle_end;
}
void IrEmitter::ProfilingState::RecordCompleteComputation(
- llvm::IRBuilder<>* ir_builder, llvm::Value* prof_counter) {
+ llvm::IRBuilder<>* b, llvm::Value* prof_counter) {
if (last_read_cycle_end_ && first_read_cycle_start_) {
- UpdateProfileCounter(ir_builder, prof_counter, last_read_cycle_end_,
+ UpdateProfileCounter(b, prof_counter, last_read_cycle_end_,
first_read_cycle_start_);
}
}
@@ -2661,14 +2647,14 @@ void IrEmitter::ProfilingState::RecordCompleteComputation(
Status IrEmitter::Preprocess(HloInstruction* hlo) {
VLOG(3) << "Visiting: " << hlo->ToString();
if (instruction_to_profile_idx_.count(hlo)) {
- profiling_state_.RecordCycleStart(&ir_builder_, hlo);
+ profiling_state_.RecordCycleStart(&b_, hlo);
}
return Status::OK();
}
Status IrEmitter::Postprocess(HloInstruction* hlo) {
if (auto* prof_counter = GetProfileCounterFor(*hlo)) {
- profiling_state_.RecordCycleDelta(&ir_builder_, hlo, prof_counter);
+ profiling_state_.RecordCycleDelta(&b_, hlo, prof_counter);
}
return Status::OK();
}
@@ -2727,22 +2713,24 @@ llvm::Value* IrEmitter::EmitTempBufferPointer(
CHECK_EQ(1, assigned_buffers.size());
const Shape& shape = assigned_buffers.begin()->first->shape();
- llvm::AllocaInst*& tempbuf_address = thread_local_buffers_[{
- ir_builder_.GetInsertBlock()->getParent(), slice}];
+ llvm::AllocaInst*& tempbuf_address =
+ thread_local_buffers_[{b_.GetInsertBlock()->getParent(), slice}];
if (tempbuf_address == nullptr) {
tempbuf_address = llvm_ir::EmitAllocaAtFunctionEntry(
IrShapeType(shape),
- tensorflow::strings::StrCat("thread_local", slice.ToString()),
- &ir_builder_, MinimumAlignmentForShape(target_shape));
+ tensorflow::strings::StrCat("thread_local", slice.ToString()), &b_,
+ MinimumAlignmentForShape(target_shape));
}
- return ir_builder_.CreateBitCast(tempbuf_address,
- element_type->getPointerTo());
+ return b_.CreateBitCast(tempbuf_address, element_type->getPointerTo());
+ }
+
+ if (allocation.is_constant()) {
+ return FindOrDie(constant_buffer_to_global_, allocation.index());
}
llvm::Value* tempbuf_address_ptr = llvm_ir::EmitBufferIndexingGEP(
- GetTempBuffersArgument(), slice.index(), &ir_builder_);
- llvm::LoadInst* tempbuf_address_base =
- ir_builder_.CreateLoad(tempbuf_address_ptr);
+ GetTempBuffersArgument(), slice.index(), &b_);
+ llvm::LoadInst* tempbuf_address_base = b_.CreateLoad(tempbuf_address_ptr);
if (is_top_level_computation_ &&
hlo_module_config_.debug_options()
.xla_llvm_enable_invariant_load_metadata()) {
@@ -2761,11 +2749,11 @@ llvm::Value* IrEmitter::EmitTempBufferPointer(
llvm::Value* tempbuf_address_untyped = tempbuf_address_base;
if (slice.offset() > 0) {
// Adjust the address to account for the slice offset.
- tempbuf_address_untyped = ir_builder_.CreateInBoundsGEP(
- tempbuf_address_base, ir_builder_.getInt64(slice.offset()));
+ tempbuf_address_untyped =
+ b_.CreateInBoundsGEP(tempbuf_address_base, b_.getInt64(slice.offset()));
}
- return ir_builder_.CreateBitCast(tempbuf_address_untyped,
- element_type->getPointerTo());
+ return b_.CreateBitCast(tempbuf_address_untyped,
+ element_type->getPointerTo());
}
// Emits a function call returning a single array element. Allocates space
@@ -2776,7 +2764,7 @@ llvm::Value* IrEmitter::EmitElementFunctionCall(
tensorflow::StringPiece name) {
llvm::Value* return_value_buffer = EmitArrayFunctionCall(
function, return_shape, 1, parameter_addresses, name);
- return ir_builder_.CreateLoad(
+ return b_.CreateLoad(
return_value_buffer,
AsStringRef(tensorflow::strings::StrCat(name, "_return_value")));
}
@@ -2794,9 +2782,9 @@ llvm::Value* IrEmitter::EmitElementFunctionCall(
void IrEmitter::EmitArrayFunctionCallInto(
llvm::Function* function, gtl::ArraySlice<llvm::Value*> parameter_addresses,
llvm::Value* return_value_buffer, tensorflow::StringPiece name) {
- ir_builder_.CreateCall(
- function, GetArrayFunctionCallArguments(
- parameter_addresses, &ir_builder_, name,
+ b_.CreateCall(function,
+ GetArrayFunctionCallArguments(
+ parameter_addresses, &b_, name,
/*return_value_buffer=*/return_value_buffer,
/*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
/*temp_buffers_arg=*/GetTempBuffersArgument(),
@@ -2808,13 +2796,13 @@ llvm::Value* IrEmitter::EmitArrayFunctionCall(
gtl::ArraySlice<llvm::Value*> parameter_addresses,
tensorflow::StringPiece name) {
llvm::Value* elements =
- llvm::ConstantInt::get(ir_builder_.getInt64Ty(), element_count);
+ llvm::ConstantInt::get(b_.getInt64Ty(), element_count);
PrimitiveType return_type = return_shape.element_type();
llvm::Value* return_value_buffer =
llvm_ir::EmitAllocaAtFunctionEntryWithCount(
llvm_ir::PrimitiveTypeToIrType(return_type, module_), elements,
- tensorflow::strings::StrCat(name, "_return_value_address"),
- &ir_builder_, MinimumAlignmentForPrimitiveType(return_type));
+ tensorflow::strings::StrCat(name, "_return_value_address"), &b_,
+ MinimumAlignmentForPrimitiveType(return_type));
EmitArrayFunctionCallInto(function, parameter_addresses, return_value_buffer,
name);
return return_value_buffer;
@@ -2836,8 +2824,7 @@ Status IrEmitter::EmitTargetAddressForOp(const HloInstruction* op) {
attr_builder.addDereferenceableAttr(ByteSizeOf(target_shape));
retval->addAttrs(attr_builder);
}
- addr = ir_builder_.CreateBitCast(retval,
- IrShapeType(target_shape)->getPointerTo());
+ addr = b_.CreateBitCast(retval, IrShapeType(target_shape)->getPointerTo());
} else {
// For other nodes, we need the temporary buffer allocated for this node to
// write the result into.
@@ -2879,14 +2866,14 @@ Status IrEmitter::EmitTargetElementLoop(
llvm_ir::IrArray(op_target_address, element_shape));
}
TF_RETURN_IF_ERROR(
- llvm_ir::LoopEmitter(element_generator, output_arrays, &ir_builder_)
+ llvm_ir::LoopEmitter(element_generator, output_arrays, &b_)
.EmitLoop(IrName(target_op)));
std::vector<llvm::Value*> tuple_operand_ptrs;
for (int64 i = 0; i < output_arrays.size(); ++i) {
tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
}
- llvm_ir::EmitTuple(target_array, tuple_operand_ptrs, &ir_builder_, module_);
+ llvm_ir::EmitTuple(target_array, tuple_operand_ptrs, &b_, module_);
} else {
if (ShouldEmitParallelLoopFor(*target_op)) {
@@ -2895,11 +2882,11 @@ Status IrEmitter::EmitTargetElementLoop(
compute_function_->GetDynamicLoopBounds();
// Emit parallel loop with dynamic loop bounds for most-major dimensions.
TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, target_array,
- &dynamic_loop_bounds, &ir_builder_)
+ &dynamic_loop_bounds, &b_)
.EmitLoop(IrName(target_op)));
} else {
TF_RETURN_IF_ERROR(
- llvm_ir::LoopEmitter(element_generator, target_array, &ir_builder_)
+ llvm_ir::LoopEmitter(element_generator, target_array, &b_)
.EmitLoop(IrName(target_op)));
}
}
@@ -2912,8 +2899,8 @@ Status IrEmitter::EmitMemcpy(const HloInstruction& source,
llvm::Value* destination_value = GetEmittedValueFor(&destination);
int64 source_size = ByteSizeOf(source.shape());
// TODO(b/63762267): Be more aggressive about specifying alignment.
- ir_builder_.CreateMemCpy(destination_value, /*DstAlign=*/1, source_value,
- /*SrcAlign=*/1, source_size);
+ b_.CreateMemCpy(destination_value, /*DstAlign=*/1, source_value,
+ /*SrcAlign=*/1, source_size);
return Status::OK();
}
@@ -2941,7 +2928,7 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
for (const HloInstruction* operand : hlo->operands()) {
operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
- return GetIrArrayFor(operand).EmitReadArrayElement(index, &ir_builder_);
+ return GetIrArrayFor(operand).EmitReadArrayElement(index, &b_);
};
}
CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_);
@@ -2956,8 +2943,8 @@ StatusOr<llvm::Value*> IrEmitter::EmitScalarCall(
std::vector<llvm::Value*> argument_addrs;
for (auto argument : arguments) {
llvm::Value* argument_addr = llvm_ir::EmitAllocaAtFunctionEntry(
- argument->getType(), "arg_addr", &ir_builder_);
- ir_builder_.CreateStore(argument, argument_addr);
+ argument->getType(), "arg_addr", &b_);
+ b_.CreateStore(argument, argument_addr);
argument_addrs.push_back(argument_addr);
}
return EmitElementFunctionCall(llvm_function,
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 419f19c24d..03bbb2afb5 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -98,13 +98,16 @@ class IrEmitter : public DfsHloVisitorWithDefault {
bool is_top_level_computation,
std::vector<const HloInstruction*>* instruction_order);
- llvm::IRBuilder<>* ir_builder() { return &ir_builder_; }
+ llvm::IRBuilder<>* b() { return &b_; }
// Emits a call to `computation` with scalar arguments `arguments`.
StatusOr<llvm::Value*> EmitScalarCall(
PrimitiveType return_type, HloComputation* computation,
const std::vector<llvm::Value*>& arguments, tensorflow::StringPiece name);
+ // Emit an LLVM global variable for every constant buffer allocation.
+ Status EmitConstantGlobals();
+
protected:
//
// The following methods implement the DfsHloVisitor interface.
@@ -148,6 +151,8 @@ class IrEmitter : public DfsHloVisitorWithDefault {
Status HandleConcatenate(HloInstruction* concatenate) override;
Status HandleConditional(HloInstruction* conditional) override;
Status HandleAfterAll(HloInstruction* gen_token) override;
+ Status HandleIota(HloInstruction* iota) override;
+ Status HandleRng(HloInstruction* rng) override;
Status FinishVisit(HloInstruction* root) override;
Status Preprocess(HloInstruction* hlo) override;
@@ -415,7 +420,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
// creates the encapsulated llvm::Function s.t. it is added to the llvm
// module's function list).
std::unique_ptr<IrFunction> compute_function_;
- llvm::IRBuilder<> ir_builder_;
+ llvm::IRBuilder<> b_;
// Maps HLO instructions to their index into the profile counter array.
const std::unordered_map<const HloInstruction*, int64>
@@ -451,23 +456,22 @@ class IrEmitter : public DfsHloVisitorWithDefault {
: use_rdtscp_(use_rdtscp), prof_counters_(prof_counters) {}
// Record the cycle counter before an HLO executes.
- void RecordCycleStart(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo);
+ void RecordCycleStart(llvm::IRBuilder<>* b, HloInstruction* hlo);
// Record the number of cycles it took for an HLO to execute.
- void RecordCycleDelta(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo,
+ void RecordCycleDelta(llvm::IRBuilder<>* b, HloInstruction* hlo,
llvm::Value* prof_counter);
// Record the number of cycles it took for the entire computation to
// execute.
- void RecordCompleteComputation(llvm::IRBuilder<>* ir_builder,
+ void RecordCompleteComputation(llvm::IRBuilder<>* b,
llvm::Value* prof_counter);
// Convenience function to generate a call to an intrinsic which reads the
// CPU cycle counter.
- llvm::Value* ReadCycleCounter(llvm::IRBuilder<>* ir_builder);
+ llvm::Value* ReadCycleCounter(llvm::IRBuilder<>* b);
// Store the cycle counter delta to the per-HLO profile counter.
- void UpdateProfileCounter(llvm::IRBuilder<>* ir_builder,
- llvm::Value* prof_counter, llvm::Value* cycle_end,
- llvm::Value* cycle_start);
+ void UpdateProfileCounter(llvm::IRBuilder<>* b, llvm::Value* prof_counter,
+ llvm::Value* cycle_end, llvm::Value* cycle_start);
private:
// Should we use the x86-specific rdtscp or the generic readcyclecounter
@@ -559,6 +563,9 @@ class IrEmitter : public DfsHloVisitorWithDefault {
LiteralPtrHashFunctor, LiteralPtrEqualityFunctor>
emitted_literals_;
+ tensorflow::gtl::FlatMap<BufferAllocation::Index, llvm::Constant*>
+ constant_buffer_to_global_;
+
TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter);
};
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
index 2d6f2f3818..6aff838462 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -49,11 +49,10 @@ IrFunction::IrFunction(const string& function_name,
llvm::Function::LinkageTypes linkage,
const bool optimize_for_size_requested,
const bool enable_fast_math, llvm::Module* llvm_module,
- llvm::IRBuilder<>* ir_builder,
- int64 num_dynamic_loop_bounds)
- : ir_builder_(ir_builder),
+ llvm::IRBuilder<>* b, int64 num_dynamic_loop_bounds)
+ : b_(b),
llvm_module_(llvm_module),
- caller_insert_point_guard_(*ir_builder),
+ caller_insert_point_guard_(*b),
num_dynamic_loop_bounds_(num_dynamic_loop_bounds) {
Initialize(function_name, linkage, optimize_for_size_requested,
enable_fast_math);
@@ -61,7 +60,7 @@ IrFunction::IrFunction(const string& function_name,
IrFunction::~IrFunction() {
// Emit function return value.
- ir_builder_->CreateRetVoid();
+ b_->CreateRetVoid();
}
DynamicLoopBounds IrFunction::GetDynamicLoopBounds() {
@@ -174,7 +173,7 @@ void IrFunction::Initialize(const string& function_name,
function_->addAttribute(argument.getArgNo() + 1, llvm::Attribute::NoAlias);
}
- ir_builder_->SetInsertPoint(llvm::BasicBlock::Create(
+ b_->SetInsertPoint(llvm::BasicBlock::Create(
/*Context=*/llvm_module_->getContext(),
/*Name=*/"entry",
/*Parent=*/function_));
@@ -184,9 +183,8 @@ llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
CHECK_GT(num_dynamic_loop_bounds_, 0);
CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
- return ir_builder_->CreateLoad(
- ir_builder_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
- ir_builder_->getInt64(offset), AsStringRef(name)));
+ return b_->CreateLoad(b_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
+ b_->getInt64(offset), AsStringRef(name)));
}
// Emits code to allocate an array of parameter address pointers, and store
@@ -195,27 +193,25 @@ llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
// address buffer).
std::vector<llvm::Value*> GetArrayFunctionCallArguments(
tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
- llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+ llvm::IRBuilder<>* b, tensorflow::StringPiece name,
llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg) {
llvm::Value* parameter_addresses_buffer =
llvm_ir::EmitAllocaAtFunctionEntryWithCount(
- ir_builder->getInt8PtrTy(),
- ir_builder->getInt32(parameter_addresses.size()),
- tensorflow::strings::StrCat(name, "_parameter_addresses"),
- ir_builder);
+ b->getInt8PtrTy(), b->getInt32(parameter_addresses.size()),
+ tensorflow::strings::StrCat(name, "_parameter_addresses"), b);
for (size_t i = 0; i < parameter_addresses.size(); ++i) {
- llvm::Value* parameter_as_i8ptr = ir_builder->CreateBitCast(
- parameter_addresses[i], ir_builder->getInt8PtrTy(),
- AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i,
- "_address_as_i8ptr")));
- llvm::Value* slot_in_param_addresses = ir_builder->CreateInBoundsGEP(
- parameter_addresses_buffer, {ir_builder->getInt64(i)});
- ir_builder->CreateStore(parameter_as_i8ptr, slot_in_param_addresses);
+ llvm::Value* parameter_as_i8ptr =
+ b->CreateBitCast(parameter_addresses[i], b->getInt8PtrTy(),
+ AsStringRef(tensorflow::strings::StrCat(
+ name, "_parameter_", i, "_address_as_i8ptr")));
+ llvm::Value* slot_in_param_addresses =
+ b->CreateInBoundsGEP(parameter_addresses_buffer, {b->getInt64(i)});
+ b->CreateStore(parameter_as_i8ptr, slot_in_param_addresses);
}
const auto to_int8_ptr = [=](llvm::Value* ptr) {
- return ir_builder->CreatePointerCast(ptr, ir_builder->getInt8PtrTy());
+ return b->CreatePointerCast(ptr, b->getInt8PtrTy());
};
std::vector<llvm::Value*> arguments{
to_int8_ptr(return_value_buffer), to_int8_ptr(exec_run_options_arg),
@@ -230,22 +226,21 @@ std::vector<llvm::Value*> GetArrayFunctionCallArguments(
// calls to 'parallel_function' (and joins threads before returning).
Status EmitCallToParallelForkJoin(
const std::vector<llvm::Value*>& arguments, const Shape& shape,
- const std::vector<int64>& dimension_partition_counts,
- llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
- const string& name) {
- llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+ const std::vector<int64>& dimension_partition_counts, llvm::IRBuilder<>* b,
+ llvm::Function* parallel_function, const string& name) {
+ llvm::Module* module = b->GetInsertBlock()->getModule();
// Build ParallelForkJoin function type.
std::vector<llvm::Type*> compute_function_params =
GetComputeFunctionParams(module, /*num_dynamic_loop_bounds=*/0);
// Number of parallel compute functions.
- compute_function_params.push_back(ir_builder->getInt32Ty());
+ compute_function_params.push_back(b->getInt32Ty());
// Array of partitions. There is an array element for each
// partition x partition_dim x 2 (for dimension start and limit).
compute_function_params.push_back(
llvm::Type::getInt64PtrTy(module->getContext()));
// Number of partitioned most-major dimensions in 'shape'.
- compute_function_params.push_back(ir_builder->getInt32Ty());
+ compute_function_params.push_back(b->getInt32Ty());
// Function pointer for compute function to be dispatched in parallel.
compute_function_params.push_back(
llvm::Type::getInt8PtrTy(module->getContext()));
@@ -268,7 +263,7 @@ Status EmitCallToParallelForkJoin(
ShapePartitionIterator partition_iterator(shape, dimension_partition_counts);
const int64 num_partitions = partition_iterator.GetTotalPartitionCount();
// Add argument specifying the number of parallel partitions.
- fork_join_arguments.push_back(ir_builder->getInt32(num_partitions));
+ fork_join_arguments.push_back(b->getInt32(num_partitions));
// The number of partitioned most-major dimensions in 'shape'.
const int32 num_partitioned_dims = dimension_partition_counts.size();
@@ -293,15 +288,15 @@ Status EmitCallToParallelForkJoin(
const std::pair<int64, int64>& dim_partition = dim_partitions[j];
const int32 index = partition_index + j * dim_partition_size;
// Store partition [dim_start, dim_limit) intervals for each dimension.
- partitions[index] = ir_builder->getInt64(dim_partition.first);
+ partitions[index] = b->getInt64(dim_partition.first);
partitions[index + 1] =
- ir_builder->getInt64(dim_partition.first + dim_partition.second);
+ b->getInt64(dim_partition.first + dim_partition.second);
}
}
// Create global variable out of dimension partitions in 'partitions'.
llvm::ArrayType* partitions_array_type =
- llvm::ArrayType::get(ir_builder->getInt64Ty(), partition_array_size);
+ llvm::ArrayType::get(b->getInt64Ty(), partition_array_size);
llvm::Constant* partitions_array =
llvm::ConstantArray::get(partitions_array_type, partitions);
llvm::GlobalVariable* global_partitions_array = new llvm::GlobalVariable(
@@ -315,16 +310,16 @@ Status EmitCallToParallelForkJoin(
tensorflow::strings::StrCat(name, "_parallel_dimension_partitions")));
// Add argument specifying parallel dimension partitions.
- fork_join_arguments.push_back(ir_builder->CreateBitCast(
- global_partitions_array,
- llvm::Type::getInt64PtrTy(module->getContext())));
+ fork_join_arguments.push_back(
+ b->CreateBitCast(global_partitions_array,
+ llvm::Type::getInt64PtrTy(module->getContext())));
// Add argument specifying the number of partitioned most-major dimensions.
- fork_join_arguments.push_back(ir_builder->getInt32(num_partitioned_dims));
+ fork_join_arguments.push_back(b->getInt32(num_partitioned_dims));
// Add argument for parallel compute function pointer.
fork_join_arguments.push_back(
- ir_builder->CreateBitCast(parallel_function, ir_builder->getInt8PtrTy()));
+ b->CreateBitCast(parallel_function, b->getInt8PtrTy()));
// Emit call to parallel fork/join.
- ir_builder->CreateCall(fork_join_func, fork_join_arguments);
+ b->CreateCall(fork_join_func, fork_join_arguments);
return Status::OK();
}
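// Layout sketch for the flattened `partitions` array built above: one
// (start, limit) pair per partitioned dimension per partition, with
// limit = start + size, since each `dim_partition` holds (start, size).
// Reading partition p, dimension d on the runtime side then looks roughly
// like:
//
//   int64_t start = partitions[(p * num_partitioned_dims + d) * 2 + 0];
//   int64_t limit = partitions[(p * num_partitioned_dims + d) * 2 + 1];
//
// (The index arithmetic is illustrative; the emitter's `partition_index` and
// `index` variables compute the same flattening.)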
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h
index 2e55181eed..a41cbb64cd 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.h
@@ -54,7 +54,7 @@ class IrFunction {
IrFunction(const string& function_name, llvm::Function::LinkageTypes linkage,
const bool optimize_for_size_requested,
const bool enable_fast_math, llvm::Module* llvm_module,
- llvm::IRBuilder<>* ir_builder, int64 num_dynamic_loop_bounds);
+ llvm::IRBuilder<>* b, int64 num_dynamic_loop_bounds);
~IrFunction();
// Emit ir to read and return the set of ir values representing the dynamic
@@ -97,7 +97,7 @@ class IrFunction {
// 'offset' from the "dynamic_loop_bounds" argument of this function.
llvm::Value* GetDynamicLoopBound(int64 offset);
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
llvm::Module* llvm_module_;
llvm::IRBuilder<>::InsertPointGuard caller_insert_point_guard_;
@@ -116,7 +116,7 @@ class IrFunction {
// Returns an array of compute function call argument ir values.
std::vector<llvm::Value*> GetArrayFunctionCallArguments(
tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
- llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+ llvm::IRBuilder<>* b, tensorflow::StringPiece name,
llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg);
@@ -124,9 +124,8 @@ std::vector<llvm::Value*> GetArrayFunctionCallArguments(
// calls to 'parallel_function' (and joins threads before returning).
Status EmitCallToParallelForkJoin(
const std::vector<llvm::Value*>& arguments, const Shape& shape,
- const std::vector<int64>& dimension_partition_counts,
- llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
- const string& name);
+ const std::vector<int64>& dimension_partition_counts, llvm::IRBuilder<>* b,
+ llvm::Function* parallel_function, const string& name);
} // namespace cpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index 2e5cc96098..cef5e57b0b 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -21,6 +21,7 @@ limitations under the License.
#include "llvm/IR/Verifier.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h"
#include "tensorflow/core/lib/core/casts.h"
#include "tensorflow/core/platform/logging.h"
@@ -52,46 +53,14 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
llvm::BasicBlock* vector_tanh_body =
llvm::BasicBlock::Create(*context, "body", vector_tanh_function);
- llvm::IRBuilder<> ir_builder(vector_tanh_body);
+ llvm::IRBuilder<> b(vector_tanh_body);
llvm::FastMathFlags fast_math_flags;
- fast_math_flags.setFast();
- ir_builder.setFastMathFlags(fast_math_flags);
-
- VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "tanh_f32");
+ fast_math_flags.setFast(enable_fast_math);
+ b.setFastMathFlags(fast_math_flags);
llvm::Value* input = &*vector_tanh_function->arg_begin();
- CHECK_EQ(input->getType(), vsl.vector_type());
-
- // This implements the same rational interpolant as implemented in Eigen3.
- llvm::Value* input_clamped =
- vsl.Clamp(input, /*low=*/GetIeeeF32(-9.0), /*high=*/GetIeeeF32(9.0));
-
- std::array<float, 7> numerator_coeffs{
- -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
- 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
- 4.89352455891786e-03f};
-
- std::array<float, 4> denominator_coeffs{
- 1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
- 4.89352518554385e-03f};
-
- llvm::Value* input_squared = vsl.Mul(input_clamped, input_clamped);
- llvm::Value* numerator = vsl.SplatFloat(GetIeeeF32(numerator_coeffs[0]));
- for (int i = 1; i < numerator_coeffs.size(); i++) {
- numerator =
- vsl.MulAdd(input_squared, numerator, GetIeeeF32(numerator_coeffs[i]));
- }
-
- numerator = vsl.Mul(input_clamped, numerator);
-
- llvm::Value* denominator = vsl.SplatFloat(GetIeeeF32(denominator_coeffs[0]));
- for (int i = 1; i < denominator_coeffs.size(); i++) {
- denominator = vsl.MulAdd(input_squared, denominator,
- GetIeeeF32(denominator_coeffs[i]));
- }
-
- llvm::Value* result = vsl.Div(numerator, denominator);
- ir_builder.CreateRet(result);
+ CHECK_EQ(vector_width, input->getType()->getVectorNumElements());
+ b.CreateRet(llvm_ir::EmitFastTanh(&b, input));
DCHECK(!llvm::verifyFunction(*vector_tanh_function));
return vector_tanh_function;
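// For reference, a scalar sketch of the rational interpolant that the deleted
// block above computed and that llvm_ir::EmitFastTanh now emits. The
// coefficients are copied from the removed code; this is an illustration, not
// the shared implementation itself:
//
//   float fast_tanh(float x) {
//     x = std::max(-9.0f, std::min(9.0f, x));  // same clamp as vsl.Clamp
//     const float num[7] = {-2.76076847742355e-16f, 2.00018790482477e-13f,
//                           -8.60467152213735e-11f, 5.12229709037114e-08f,
//                           1.48572235717979e-05f,  6.37261928875436e-04f,
//                           4.89352455891786e-03f};
//     const float den[4] = {1.19825839466702e-06f, 1.18534705686654e-04f,
//                           2.26843463243900e-03f, 4.89352518554385e-03f};
//     const float x2 = x * x;
//     float p = num[0], q = den[0];
//     for (int i = 1; i < 7; ++i) p = p * x2 + num[i];  // Horner in x^2
//     for (int i = 1; i < 4; ++i) q = q * x2 + den[i];
//     return (x * p) / q;  // odd function: x * P(x^2) / Q(x^2)
//   }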
@@ -113,12 +82,12 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
llvm::BasicBlock* vector_exp_body =
llvm::BasicBlock::Create(*context, "body", vector_exp_function);
- llvm::IRBuilder<> ir_builder(vector_exp_body);
+ llvm::IRBuilder<> b(vector_exp_body);
llvm::FastMathFlags fast_math_flags;
fast_math_flags.setFast();
- ir_builder.setFastMathFlags(fast_math_flags);
+ b.setFastMathFlags(fast_math_flags);
- VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "exp_f32");
+ VectorSupportLibrary vsl(F32, vector_width, &b, "exp_f32");
// This implements the same polynomial approximation as implemented in Eigen3.
@@ -160,21 +129,21 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
// VectorSupportLibrary (intentionally) can't juggle more than one type at a
// time so drop down to IRBuilder for this bit.
llvm::Value* vector_constant_0x7f =
- ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(0x7f));
+ b.CreateVectorSplat(vector_width, b.getInt32(0x7f));
llvm::Value* vector_constant_23 =
- ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(23));
+ b.CreateVectorSplat(vector_width, b.getInt32(23));
llvm::Type* i32_vector_type =
- llvm::VectorType::get(ir_builder.getInt32Ty(), vector_width);
+ llvm::VectorType::get(b.getInt32Ty(), vector_width);
// fx is clamped so we don't have to worry about it being out of range for
// i32.
- llvm::Value* emm0 = ir_builder.CreateFPToSI(fx, i32_vector_type);
- emm0 = ir_builder.CreateAdd(emm0, vector_constant_0x7f);
- emm0 = ir_builder.CreateShl(emm0, vector_constant_23);
- llvm::Value* emm0_f32 = ir_builder.CreateBitCast(emm0, vsl.vector_type());
+ llvm::Value* emm0 = b.CreateFPToSI(fx, i32_vector_type);
+ emm0 = b.CreateAdd(emm0, vector_constant_0x7f);
+ emm0 = b.CreateShl(emm0, vector_constant_23);
+ llvm::Value* emm0_f32 = b.CreateBitCast(emm0, vsl.vector_type());
llvm::Value* result = vsl.Max(vsl.Mul(y, emm0_f32), input);
- ir_builder.CreateRet(result);
+ b.CreateRet(result);
DCHECK(!llvm::verifyFunction(*vector_exp_function));
return vector_exp_function;
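// The integer manipulation above is the standard "build 2^n in the exponent
// field" trick. A scalar sketch, assuming n is an integral float already
// clamped into range (uses std::memcpy from <cstring> for the bitcast):
//
//   float exp2_int(float n) {
//     int32_t bits = ((int32_t)n + 0x7f) << 23;  // biased exponent, zero mantissa
//     float r;
//     std::memcpy(&r, &bits, sizeof(r));         // what CreateBitCast does
//     return r;                                  // == 2^n
//   }
//
// The polynomial estimate of e^r is then scaled by this power of two
// (vsl.Mul) to reconstruct e^x = e^r * 2^m from its fractional and integral
// parts.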
@@ -196,13 +165,13 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
llvm::BasicBlock* vector_log_body =
llvm::BasicBlock::Create(*context, "body", vector_log_function);
- llvm::IRBuilder<> ir_builder(vector_log_body);
+ llvm::IRBuilder<> b(vector_log_body);
llvm::FastMathFlags fast_math_flags;
fast_math_flags.setFast();
- ir_builder.setFastMathFlags(fast_math_flags);
+ b.setFastMathFlags(fast_math_flags);
llvm::Value* input = &*vector_log_function->arg_begin();
- VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "log_f32");
+ VectorSupportLibrary vsl(F32, vector_width, &b, "log_f32");
const llvm::APFloat half = GetIeeeF32(0.5);
const llvm::APFloat one = GetIeeeF32(1.0);
@@ -238,22 +207,21 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
// VectorSupportLibrary (intentionally) can't juggle more than one type at a
// time so drop down to IRBuilder for this bit.
llvm::Value* vector_constant_0x7f =
- ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(0x7f));
+ b.CreateVectorSplat(vector_width, b.getInt32(0x7f));
llvm::Value* vector_constant_23 =
- ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(23));
+ b.CreateVectorSplat(vector_width, b.getInt32(23));
llvm::Type* i32_vector_type =
- llvm::VectorType::get(ir_builder.getInt32Ty(), vector_width);
+ llvm::VectorType::get(b.getInt32Ty(), vector_width);
- llvm::Value* emm0 = ir_builder.CreateLShr(
- ir_builder.CreateBitCast(input, i32_vector_type), vector_constant_23);
+ llvm::Value* emm0 =
+ b.CreateLShr(b.CreateBitCast(input, i32_vector_type), vector_constant_23);
  // Keep only the mantissa bits (drop the exponent).
input = vsl.FloatAnd(input, inv_mant_mask);
input = vsl.FloatOr(input, half);
- emm0 = ir_builder.CreateSub(emm0, vector_constant_0x7f);
- llvm::Value* e =
- vsl.Add(one, ir_builder.CreateSIToFP(emm0, vsl.vector_type()));
+ emm0 = b.CreateSub(emm0, vector_constant_0x7f);
+ llvm::Value* e = vsl.Add(one, b.CreateSIToFP(emm0, vsl.vector_type()));
// part2:
// if( x < SQRTHF ) {
@@ -294,7 +262,7 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
llvm::Value* or_rhs = vsl.FloatAnd(iszero_mask, minus_inf);
llvm::Value* result = vsl.FloatOr(or_lhs, or_rhs);
- ir_builder.CreateRet(result);
+ b.CreateRet(result);
DCHECK(!llvm::verifyFunction(*vector_log_function));
return vector_log_function;
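// The shifts above are a frexp-style decomposition: peel the unbiased
// exponent out of the IEEE-754 bits and force the mantissa into [0.5, 1) by
// substituting the exponent field of 0.5f. A scalar sketch, assuming a
// positive finite input:
//
//   void split(float x, float* m, int32_t* e) {
//     int32_t bits;
//     std::memcpy(&bits, &x, sizeof(bits));
//     *e = (bits >> 23) - 0x7f;                  // unbiased exponent
//     bits = (bits & 0x007fffff) | 0x3f000000;   // mantissa | bits of 0.5f
//     std::memcpy(m, &bits, sizeof(float));
//   }
//
// log(x) is then e*ln(2) + log(m), with the polynomial handling log(m); the
// `part2` step renormalizes m (and adjusts e) around sqrt(0.5).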
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index 59ae5acd8b..8560e4296a 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -25,8 +25,8 @@ namespace cpu {
ParallelLoopEmitter::ParallelLoopEmitter(
const llvm_ir::ElementGenerator& target_element_generator,
const llvm_ir::IrArray& target_array,
- const DynamicLoopBounds* dynamic_loop_bounds, llvm::IRBuilder<>* ir_builder)
- : LoopEmitter(target_element_generator, target_array, ir_builder),
+ const DynamicLoopBounds* dynamic_loop_bounds, llvm::IRBuilder<>* b)
+ : LoopEmitter(target_element_generator, target_array, b),
dynamic_loop_bounds_(dynamic_loop_bounds) {}
std::vector<llvm_ir::IrArray::Index>
@@ -37,7 +37,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
CHECK(!ShapeUtil::IsTuple(shape_));
CHECK(!ShapeUtil::IsScalar(shape_));
- llvm_ir::ForLoopNest loop_nest(loop_name, ir_builder_);
+ llvm_ir::ForLoopNest loop_nest(loop_name, b_);
const int64 num_dims = shape_.dimensions_size();
llvm_ir::IrArray::Index array_index(index_type, num_dims);
@@ -65,8 +65,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
}
}
// Point IR builder at inner loop BB.
- llvm_ir::SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(),
- ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b_);
// Set exit_bb_ to the exit block of the loop nest.
exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
index 25e182a26d..076c683ca5 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -54,7 +54,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
const llvm_ir::IrArray& target_array,
const DynamicLoopBounds* dynamic_loop_bounds,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
diff --git a/tensorflow/compiler/xla/service/cpu/sample_harness.cc b/tensorflow/compiler/xla/service/cpu/sample_harness.cc
index d9e8dcaed9..f227e4ae13 100644
--- a/tensorflow/compiler/xla/service/cpu/sample_harness.cc
+++ b/tensorflow/compiler/xla/service/cpu/sample_harness.cc
@@ -21,8 +21,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD
index b4c33e2f6c..181cec3cdd 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD
@@ -135,9 +135,9 @@ tf_cc_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/service:cpu_plugin",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
index 0d45918d09..c433bddc84 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
index ccb61740f6..01daed4bcd 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
@@ -78,7 +78,7 @@ TEST_F(CpuNoAliasTest, Concat) {
llvm::Function* func = llvm::cast<llvm::Function>(
ir_module.getOrInsertFunction("test_fn", llvm::Type::getVoidTy(context)));
llvm::BasicBlock* bb = llvm::BasicBlock::Create(context, "body", func);
- llvm::IRBuilder<> ir_builder(bb);
+ llvm::IRBuilder<> b(bb);
auto* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 0);
llvm_ir::IrArray::Index zero2D({zero, zero});
@@ -90,7 +90,7 @@ TEST_F(CpuNoAliasTest, Concat) {
ir_module.getOrInsertGlobal("param_x", array2d_type);
llvm_ir::IrArray param_x_array(param_x_val, param_shape);
aa.AddAliasingInformationToIrArray(*param_x, &param_x_array);
- param_x_array.EmitReadArrayElement(zero2D, &ir_builder)
+ param_x_array.EmitReadArrayElement(zero2D, &b)
->setName("read_param_x_array");
}
@@ -100,7 +100,7 @@ TEST_F(CpuNoAliasTest, Concat) {
auto shape = ShapeUtil::MakeShape(F32, {2, 4});
llvm_ir::IrArray concat1_array(concat1_val, shape);
aa.AddAliasingInformationToIrArray(*concat1, &concat1_array);
- concat1_array.EmitReadArrayElement(zero2D, &ir_builder)
+ concat1_array.EmitReadArrayElement(zero2D, &b)
->setName("read_concat1_array");
}
@@ -110,7 +110,7 @@ TEST_F(CpuNoAliasTest, Concat) {
auto shape = ShapeUtil::MakeShape(F32, {2, 6});
llvm_ir::IrArray concat2_array(concat2_val, shape);
aa.AddAliasingInformationToIrArray(*concat2, &concat2_array);
- concat2_array.EmitReadArrayElement(zero2D, &ir_builder)
+ concat2_array.EmitReadArrayElement(zero2D, &b)
->setName("read_concat2_array");
}
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index c444d15185..3274be8d9d 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -23,14 +23,14 @@ namespace xla {
namespace cpu {
VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type,
int64 vector_size,
- llvm::IRBuilder<>* ir_builder,
+ llvm::IRBuilder<>* b,
std::string name)
: vector_size_(vector_size),
primitive_type_(primitive_type),
- ir_builder_(ir_builder),
+ b_(b),
name_(std::move(name)) {
scalar_type_ = llvm_ir::PrimitiveTypeToIrType(
- primitive_type, ir_builder_->GetInsertBlock()->getModule());
+ primitive_type, b_->GetInsertBlock()->getModule());
scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_);
vector_type_ = llvm::VectorType::get(scalar_type_, vector_size);
vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_);
@@ -63,9 +63,9 @@ llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) {
llvm::Value* VectorSupportLibrary::MulInternal(llvm::Value* lhs,
llvm::Value* rhs) {
if (scalar_type_->isFloatingPointTy()) {
- return ir_builder()->CreateFMul(lhs, rhs, name());
+ return b()->CreateFMul(lhs, rhs, name());
} else {
- return ir_builder()->CreateMul(lhs, rhs, name());
+ return b()->CreateMul(lhs, rhs, name());
}
}
@@ -76,13 +76,13 @@ llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
llvm::Value* VectorSupportLibrary::Sub(llvm::Value* lhs, llvm::Value* rhs) {
AssertCorrectTypes({lhs, rhs});
- return ir_builder()->CreateFSub(lhs, rhs);
+ return b()->CreateFSub(lhs, rhs);
}
llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) {
AssertCorrectTypes({lhs, rhs});
if (scalar_type_->isFloatingPointTy()) {
- return llvm_ir::EmitFloatMax(lhs, rhs, ir_builder_);
+ return llvm_ir::EmitFloatMax(lhs, rhs, b_);
} else {
LOG(FATAL) << "Max for integers is unimplemented";
}
@@ -91,13 +91,13 @@ llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) {
llvm::Value* VectorSupportLibrary::Floor(llvm::Value* a) {
AssertCorrectTypes({a});
return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor, {a},
- {a->getType()}, ir_builder());
+ {a->getType()}, b());
}
llvm::Value* VectorSupportLibrary::Div(llvm::Value* lhs, llvm::Value* rhs) {
AssertCorrectTypes({lhs, rhs});
if (scalar_type_->isFloatingPointTy()) {
- return ir_builder()->CreateFDiv(lhs, rhs, name());
+ return b()->CreateFDiv(lhs, rhs, name());
} else {
LOG(FATAL) << "Division for integers is unimplemented";
}
@@ -111,42 +111,41 @@ llvm::Value* VectorSupportLibrary::Clamp(llvm::Value* a,
CHECK(low.compare(high) == llvm::APFloat::cmpLessThan);
CHECK(scalar_type_->isFloatingPointTy());
return llvm_ir::EmitFloatMin(
- llvm_ir::EmitFloatMax(a, GetConstantFloat(type, low), ir_builder_),
- GetConstantFloat(type, high), ir_builder_);
+ llvm_ir::EmitFloatMax(a, GetConstantFloat(type, low), b_),
+ GetConstantFloat(type, high), b_);
}
llvm::Value* VectorSupportLibrary::FCmpEQMask(llvm::Value* lhs,
llvm::Value* rhs) {
AssertCorrectTypes({lhs, rhs});
- return I1ToFloat(ir_builder()->CreateFCmpOEQ(lhs, rhs, name()));
+ return I1ToFloat(b()->CreateFCmpOEQ(lhs, rhs, name()));
}
llvm::Value* VectorSupportLibrary::FCmpOLTMask(llvm::Value* lhs,
llvm::Value* rhs) {
AssertCorrectTypes({lhs, rhs});
- return I1ToFloat(ir_builder()->CreateFCmpOLT(lhs, rhs, name()));
+ return I1ToFloat(b()->CreateFCmpOLT(lhs, rhs, name()));
}
llvm::Value* VectorSupportLibrary::FCmpULEMask(llvm::Value* lhs,
llvm::Value* rhs) {
AssertCorrectTypes({lhs, rhs});
- return I1ToFloat(ir_builder()->CreateFCmpULE(lhs, rhs, name()));
+ return I1ToFloat(b()->CreateFCmpULE(lhs, rhs, name()));
}
llvm::Value* VectorSupportLibrary::I1ToFloat(llvm::Value* i1) {
bool is_vector = llvm::isa<llvm::VectorType>(i1->getType());
llvm::Type* integer_type = IntegerTypeForFloatSize(is_vector);
- return ir_builder()->CreateBitCast(
- ir_builder()->CreateSExt(i1, integer_type, name()),
- is_vector ? vector_type() : scalar_type(), name());
+ return b()->CreateBitCast(b()->CreateSExt(i1, integer_type, name()),
+ is_vector ? vector_type() : scalar_type(), name());
}
llvm::Type* VectorSupportLibrary::IntegerTypeForFloatSize(bool vector) {
CHECK(scalar_type()->isFloatingPointTy());
const llvm::DataLayout& data_layout =
- ir_builder()->GetInsertBlock()->getModule()->getDataLayout();
+ b()->GetInsertBlock()->getModule()->getDataLayout();
int64 float_size_bits = data_layout.getTypeSizeInBits(scalar_type());
- llvm::Type* scalar_int_type = ir_builder()->getIntNTy(float_size_bits);
+ llvm::Type* scalar_int_type = b()->getIntNTy(float_size_bits);
if (vector) {
return llvm::VectorType::get(scalar_int_type, vector_size());
} else {
@@ -156,7 +155,7 @@ llvm::Type* VectorSupportLibrary::IntegerTypeForFloatSize(bool vector) {
llvm::Value* VectorSupportLibrary::BroadcastScalar(llvm::Value* x) {
CHECK_EQ(x->getType(), scalar_type());
- return ir_builder()->CreateVectorSplat(vector_size(), x, name());
+ return b()->CreateVectorSplat(vector_size(), x, name());
}
llvm::Value* VectorSupportLibrary::FloatAnd(llvm::Value* lhs,
@@ -164,10 +163,9 @@ llvm::Value* VectorSupportLibrary::FloatAnd(llvm::Value* lhs,
AssertCorrectTypes({lhs, rhs});
llvm::Type* int_type =
IntegerTypeForFloatSize(lhs->getType() == vector_type());
- return ir_builder()->CreateBitCast(
- ir_builder()->CreateAnd(
- ir_builder()->CreateBitCast(lhs, int_type, name()),
- ir_builder()->CreateBitCast(rhs, int_type, name()), name()),
+ return b()->CreateBitCast(
+ b()->CreateAnd(b()->CreateBitCast(lhs, int_type, name()),
+ b()->CreateBitCast(rhs, int_type, name()), name()),
vector_type());
}
@@ -175,9 +173,8 @@ llvm::Value* VectorSupportLibrary::FloatNot(llvm::Value* lhs) {
AssertCorrectTypes({lhs});
llvm::Type* int_type =
IntegerTypeForFloatSize(lhs->getType() == vector_type());
- return ir_builder()->CreateBitCast(
- ir_builder()->CreateNot(
- ir_builder()->CreateBitCast(lhs, int_type, name()), name()),
+ return b()->CreateBitCast(
+ b()->CreateNot(b()->CreateBitCast(lhs, int_type, name()), name()),
vector_type());
}
@@ -185,47 +182,43 @@ llvm::Value* VectorSupportLibrary::FloatOr(llvm::Value* lhs, llvm::Value* rhs) {
AssertCorrectTypes({lhs, rhs});
llvm::Type* int_type =
IntegerTypeForFloatSize(lhs->getType() == vector_type());
- return ir_builder()->CreateBitCast(
- ir_builder()->CreateOr(ir_builder()->CreateBitCast(lhs, int_type, name()),
- ir_builder()->CreateBitCast(rhs, int_type, name()),
- name()),
+ return b()->CreateBitCast(
+ b()->CreateOr(b()->CreateBitCast(lhs, int_type, name()),
+ b()->CreateBitCast(rhs, int_type, name()), name()),
vector_type(), name());
}
llvm::Value* VectorSupportLibrary::AddInternal(llvm::Value* lhs,
llvm::Value* rhs) {
if (scalar_type_->isFloatingPointTy()) {
- return ir_builder()->CreateFAdd(lhs, rhs, name());
+ return b()->CreateFAdd(lhs, rhs, name());
} else {
- return ir_builder()->CreateAdd(lhs, rhs, name());
+ return b()->CreateAdd(lhs, rhs, name());
}
}
llvm::Value* VectorSupportLibrary::ComputeOffsetPointer(
llvm::Value* base_pointer, llvm::Value* offset_elements) {
if (base_pointer->getType() != scalar_pointer_type()) {
- base_pointer = ir_builder()->CreateBitCast(base_pointer,
- scalar_pointer_type(), name());
+ base_pointer =
+ b()->CreateBitCast(base_pointer, scalar_pointer_type(), name());
}
- return ir_builder()->CreateInBoundsGEP(base_pointer, {offset_elements},
- name());
+ return b()->CreateInBoundsGEP(base_pointer, {offset_elements}, name());
}
llvm::Value* VectorSupportLibrary::LoadVector(llvm::Value* pointer) {
if (pointer->getType() != vector_pointer_type()) {
- pointer =
- ir_builder()->CreateBitCast(pointer, vector_pointer_type(), name());
+ pointer = b()->CreateBitCast(pointer, vector_pointer_type(), name());
}
- return ir_builder()->CreateAlignedLoad(
+ return b()->CreateAlignedLoad(
pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
}
llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) {
if (pointer->getType() != scalar_pointer_type()) {
- pointer =
- ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+ pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name());
}
- return ir_builder()->CreateAlignedLoad(
+ return b()->CreateAlignedLoad(
pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
}
@@ -233,30 +226,28 @@ void VectorSupportLibrary::StoreVector(llvm::Value* value,
llvm::Value* pointer) {
AssertCorrectTypes({value});
if (pointer->getType() != vector_pointer_type()) {
- pointer = ir_builder()->CreateBitCast(pointer, vector_pointer_type());
+ pointer = b()->CreateBitCast(pointer, vector_pointer_type());
}
- ir_builder()->CreateAlignedStore(
- value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
+ b()->CreateAlignedStore(value, pointer,
+ ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
}
void VectorSupportLibrary::StoreScalar(llvm::Value* value,
llvm::Value* pointer) {
AssertCorrectTypes({value});
if (pointer->getType() != scalar_pointer_type()) {
- pointer =
- ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+ pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name());
}
- ir_builder()->CreateAlignedStore(
- value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
+ b()->CreateAlignedStore(value, pointer,
+ ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
}
llvm::Value* VectorSupportLibrary::LoadBroadcast(llvm::Value* pointer) {
if (pointer->getType() != scalar_pointer_type()) {
- pointer =
- ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+ pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name());
}
- return ir_builder()->CreateVectorSplat(
- vector_size(), ir_builder()->CreateLoad(pointer), name());
+ return b()->CreateVectorSplat(vector_size(), b()->CreateLoad(pointer),
+ name());
}
llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
@@ -267,20 +258,19 @@ llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
for (unsigned j = 0; j < vector_size(); ++j) {
if (j < (i / 2)) {
- mask[j] = ir_builder()->getInt32(i / 2 + j);
+ mask[j] = b()->getInt32(i / 2 + j);
} else {
- mask[j] = llvm::UndefValue::get(ir_builder()->getInt32Ty());
+ mask[j] = llvm::UndefValue::get(b()->getInt32Ty());
}
}
- llvm::Value* half_remaining_lanes = ir_builder()->CreateShuffleVector(
- vector, llvm::UndefValue::get(vector_type()),
- llvm::ConstantVector::get(mask), "");
+ llvm::Value* half_remaining_lanes =
+ b()->CreateShuffleVector(vector, llvm::UndefValue::get(vector_type()),
+ llvm::ConstantVector::get(mask), "");
vector = Add(vector, half_remaining_lanes);
}
- return ir_builder()->CreateExtractElement(vector, ir_builder()->getInt32(0),
- name());
+ return b()->CreateExtractElement(vector, b()->getInt32(0), name());
}
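// Scalar model of the shuffle-based reduction above: each round folds the
// upper half of the live lanes into the lower half, so lane 0 holds the total
// after log2(width) rounds. A sketch for width 8:
//
//   float add_reduce(const float v[8]) {
//     float t[8];
//     std::memcpy(t, v, sizeof(t));
//     for (int live = 8; live > 1; live /= 2)   // lanes still in play
//       for (int j = 0; j < live / 2; ++j)
//         t[j] += t[live / 2 + j];              // the shuffle + Add pair
//     return t[0];                              // ExtractElement(..., 0)
//   }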
llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs,
@@ -307,19 +297,19 @@ llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs,
// vector, which are the lanes 2 and 3 in the rhs vector.
for (int i = 0; i < vector_size(); i += 2) {
int increment = i < vector_size() / 2 ? 0 : (vector_size() / 2);
- mask_a.push_back(ir_builder()->getInt32(increment + i));
- mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+ mask_a.push_back(b()->getInt32(increment + i));
+ mask_b.push_back(b()->getInt32(increment + i + 1));
}
for (int i = 0; i < vector_size(); i += 2) {
int increment = i < vector_size() / 2 ? (vector_size() / 2) : vector_size();
- mask_a.push_back(ir_builder()->getInt32(increment + i));
- mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+ mask_a.push_back(b()->getInt32(increment + i));
+ mask_b.push_back(b()->getInt32(increment + i + 1));
}
- llvm::Value* shuffle_0 = ir_builder()->CreateShuffleVector(
- lhs, rhs, llvm::ConstantVector::get(mask_a));
- llvm::Value* shuffle_1 = ir_builder()->CreateShuffleVector(
- lhs, rhs, llvm::ConstantVector::get(mask_b));
+ llvm::Value* shuffle_0 =
+ b()->CreateShuffleVector(lhs, rhs, llvm::ConstantVector::get(mask_a));
+ llvm::Value* shuffle_1 =
+ b()->CreateShuffleVector(lhs, rhs, llvm::ConstantVector::get(mask_b));
return Add(shuffle_0, shuffle_1);
}
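// What the two masks above produce for width 8 (lhs lanes l0..l7, rhs lanes
// r0..r7): shuffle_0 = {l0,l2,r0,r2,l4,l6,r4,r6} and
// shuffle_1 = {l1,l3,r1,r3,l5,l7,r5,r7}, so the returned sum is the AVX
// vhaddps pattern
//
//   {l0+l1, l2+l3, r0+r1, r2+r3, l4+l5, l6+l7, r4+r5, r6+r7}
//
// i.e. adjacent-pair sums, interleaved per 128-bit lane.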
@@ -327,23 +317,21 @@ llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs,
llvm::Value* VectorSupportLibrary::ExtractLowHalf(llvm::Value* vector) {
llvm::SmallVector<llvm::Constant*, 32> mask;
for (int i = 0; i < vector_size() / 2; i++) {
- mask.push_back(ir_builder()->getInt32(i));
+ mask.push_back(b()->getInt32(i));
}
- return ir_builder()->CreateShuffleVector(vector,
- llvm::UndefValue::get(vector_type()),
- llvm::ConstantVector::get(mask));
+ return b()->CreateShuffleVector(vector, llvm::UndefValue::get(vector_type()),
+ llvm::ConstantVector::get(mask));
}
llvm::Value* VectorSupportLibrary::ExtractHighHalf(llvm::Value* vector) {
llvm::SmallVector<llvm::Constant*, 32> mask;
for (int i = 0; i < vector_size() / 2; i++) {
- mask.push_back(ir_builder()->getInt32(i + vector_size() / 2));
+ mask.push_back(b()->getInt32(i + vector_size() / 2));
}
- return ir_builder()->CreateShuffleVector(vector,
- llvm::UndefValue::get(vector_type()),
- llvm::ConstantVector::get(mask));
+ return b()->CreateShuffleVector(vector, llvm::UndefValue::get(vector_type()),
+ llvm::ConstantVector::get(mask));
}
std::vector<llvm::Value*> VectorSupportLibrary::ComputeHorizontalSums(
@@ -360,8 +348,8 @@ std::vector<llvm::Value*> VectorSupportLibrary::ComputeHorizontalSums(
[this](llvm::Value* vector) { return AddReduce(vector); });
if (init_values) {
for (int64 i = 0, e = result.size(); i < e; i++) {
- result[i] = Add(result[i], ir_builder()->CreateExtractElement(
- init_values, ir_builder()->getInt32(i)));
+ result[i] = Add(result[i],
+ b()->CreateExtractElement(init_values, b()->getInt32(i)));
}
}
return result;
@@ -398,9 +386,9 @@ VectorSupportLibrary::ComputeAvxOptimizedHorizontalSums(
std::vector<llvm::Value*> results;
for (int i = 0; i < lane_width; i++) {
- llvm::Value* scalar_result = ir_builder()->CreateExtractElement(
- i < (lane_width / 2) ? low : high,
- ir_builder()->getInt32(i % (lane_width / 2)), name());
+ llvm::Value* scalar_result =
+ b()->CreateExtractElement(i < (lane_width / 2) ? low : high,
+ b()->getInt32(i % (lane_width / 2)), name());
results.push_back(scalar_result);
}
@@ -415,17 +403,14 @@ llvm::Value* VectorSupportLibrary::GetZeroScalar() {
return llvm::Constant::getNullValue(scalar_type());
}
-LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* ir_builder)
- : ir_builder_(ir_builder) {
- alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", ir_builder_);
+LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* b) : b_(b) {
+ alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", b_);
}
-llvm::Value* LlvmVariable::Get() const {
- return ir_builder_->CreateLoad(alloca_);
-}
+llvm::Value* LlvmVariable::Get() const { return b_->CreateLoad(alloca_); }
void LlvmVariable::Set(llvm::Value* new_value) {
- ir_builder_->CreateStore(new_value, alloca_);
+ b_->CreateStore(new_value, alloca_);
}
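// Usage note: LlvmVariable is the classic "alloca at function entry plus
// load/store" mutable-variable idiom; LLVM's mem2reg pass later promotes it
// to SSA form. A minimal sketch against the API above, assuming a builder `b`
// positioned inside a function under construction:
//
//   LlvmVariable acc(b->getFloatTy(), b);
//   acc.Set(llvm::ConstantFP::get(b->getFloatTy(), 0.0));
//   // ... inside a loop body ...
//   acc.Set(b->CreateFAdd(acc.Get(), x));  // acc += x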
TileVariable::TileVariable(VectorSupportLibrary* vector_support,
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index 49c2a4e2f4..c728f6df0a 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -46,11 +46,11 @@ class VectorSupportLibrary {
// instance (i.e. LoadVector will load a vector of type <`vector_size` x
// `primitive_type`>).
VectorSupportLibrary(PrimitiveType primitive_type, int64 vector_size,
- llvm::IRBuilder<>* ir_builder, std::string name);
+ llvm::IRBuilder<>* b, std::string name);
llvm::Value* Mul(llvm::Value* lhs, llvm::Value* rhs);
llvm::Value* Mul(int64 lhs, llvm::Value* rhs) {
- return Mul(ir_builder()->getInt64(lhs), rhs);
+ return Mul(b()->getInt64(lhs), rhs);
}
llvm::Value* Mul(const llvm::APFloat& lhs, llvm::Value* rhs) {
return Mul(GetConstantFloat(rhs->getType(), lhs), rhs);
@@ -63,7 +63,7 @@ class VectorSupportLibrary {
llvm::Value* Add(llvm::Value* lhs, llvm::Value* rhs);
llvm::Value* Add(int64 lhs, llvm::Value* rhs) {
- return Add(ir_builder()->getInt64(lhs), rhs);
+ return Add(b()->getInt64(lhs), rhs);
}
llvm::Value* Add(const llvm::APFloat& lhs, llvm::Value* rhs) {
return Add(GetConstantFloat(rhs->getType(), lhs), rhs);
@@ -147,13 +147,11 @@ class VectorSupportLibrary {
llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
llvm::Value* offset_elements, int64 scale) {
return ComputeOffsetPointer(
- base_pointer,
- ir_builder_->CreateMul(ir_builder_->getInt64(scale), offset_elements));
+ base_pointer, b_->CreateMul(b_->getInt64(scale), offset_elements));
}
llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
int64 offset_elements) {
- return ComputeOffsetPointer(base_pointer,
- ir_builder()->getInt64(offset_elements));
+ return ComputeOffsetPointer(base_pointer, b()->getInt64(offset_elements));
}
llvm::Value* LoadVector(llvm::Value* pointer);
@@ -164,7 +162,7 @@ class VectorSupportLibrary {
}
llvm::Value* LoadVector(llvm::Value* base_pointer, int64 offset_elements) {
- return LoadVector(base_pointer, ir_builder()->getInt64(offset_elements));
+ return LoadVector(base_pointer, b()->getInt64(offset_elements));
}
llvm::Value* LoadScalar(llvm::Value* pointer);
@@ -175,7 +173,7 @@ class VectorSupportLibrary {
}
llvm::Value* LoadScalar(llvm::Value* base_pointer, int64 offset_elements) {
- return LoadScalar(base_pointer, ir_builder()->getInt64(offset_elements));
+ return LoadScalar(base_pointer, b()->getInt64(offset_elements));
}
void StoreVector(llvm::Value* value, llvm::Value* pointer);
@@ -187,7 +185,7 @@ class VectorSupportLibrary {
void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
int64 offset_elements) {
- StoreVector(value, base_pointer, ir_builder()->getInt64(offset_elements));
+ StoreVector(value, base_pointer, b()->getInt64(offset_elements));
}
void StoreScalar(llvm::Value* value, llvm::Value* pointer);
@@ -198,7 +196,7 @@ class VectorSupportLibrary {
void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
int64 offset_elements) {
- StoreScalar(base_pointer, ir_builder()->getInt64(offset_elements));
+    StoreScalar(value, base_pointer, b()->getInt64(offset_elements));
}
llvm::Value* LoadBroadcast(llvm::Value* pointer);
@@ -207,7 +205,7 @@ class VectorSupportLibrary {
return LoadBroadcast(ComputeOffsetPointer(base_pointer, offset_elements));
}
llvm::Value* LoadBroadcast(llvm::Value* base_pointer, int64 offset_elements) {
- return LoadBroadcast(base_pointer, ir_builder()->getInt64(offset_elements));
+ return LoadBroadcast(base_pointer, b()->getInt64(offset_elements));
}
// Compute the horizontal sum of each vector in `vectors`. The i'th element
@@ -220,7 +218,7 @@ class VectorSupportLibrary {
llvm::Value* GetZeroVector();
llvm::Value* GetZeroScalar();
- llvm::IRBuilder<>* ir_builder() const { return ir_builder_; }
+ llvm::IRBuilder<>* b() const { return b_; }
int64 vector_size() const { return vector_size_; }
llvm::Type* vector_type() const { return vector_type_; }
llvm::Type* vector_pointer_type() const { return vector_pointer_type_; }
@@ -277,7 +275,7 @@ class VectorSupportLibrary {
int64 vector_size_;
PrimitiveType primitive_type_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
llvm::Type* vector_type_;
llvm::Type* vector_pointer_type_;
llvm::Type* scalar_type_;
@@ -289,22 +287,21 @@ class VectorSupportLibrary {
// can later convert to a SSA value.
class LlvmVariable {
public:
- LlvmVariable(llvm::Type*, llvm::IRBuilder<>* ir_builder);
+ LlvmVariable(llvm::Type*, llvm::IRBuilder<>* b);
llvm::Value* Get() const;
void Set(llvm::Value* new_value);
private:
llvm::AllocaInst* alloca_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
};
class VectorVariable : public LlvmVariable {
public:
VectorVariable(VectorSupportLibrary* vector_support,
llvm::Value* initial_value)
- : LlvmVariable(vector_support->vector_type(),
- vector_support->ir_builder()) {
+ : LlvmVariable(vector_support->vector_type(), vector_support->b()) {
Set(initial_value);
}
};
@@ -313,8 +310,7 @@ class ScalarVariable : public LlvmVariable {
public:
ScalarVariable(VectorSupportLibrary* vector_support,
llvm::Value* initial_value)
- : LlvmVariable(vector_support->scalar_type(),
- vector_support->ir_builder()) {
+ : LlvmVariable(vector_support->scalar_type(), vector_support->b()) {
Set(initial_value);
}
};
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 51f16bdc94..097fa23027 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -212,6 +212,7 @@ class DfsHloVisitorBase {
virtual Status HandleReverse(HloInstructionPtr hlo) = 0;
virtual Status HandleSort(HloInstructionPtr hlo) = 0;
virtual Status HandleConstant(HloInstructionPtr hlo) = 0;
+ virtual Status HandleIota(HloInstructionPtr hlo) = 0;
virtual Status HandleGetTupleElement(HloInstructionPtr hlo) = 0;
virtual Status HandleReduce(HloInstructionPtr hlo) = 0;
virtual Status HandleBitcast(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 0686ca74af..f4316e0fb7 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -115,6 +115,9 @@ class DfsHloVisitorWithDefaultBase
Status HandleConstant(HloInstructionPtr constant) override {
return DefaultAction(constant);
}
+ Status HandleIota(HloInstructionPtr iota) override {
+ return DefaultAction(iota);
+ }
Status HandleGetTupleElement(HloInstructionPtr get_tuple_element) override {
return DefaultAction(get_tuple_element);
}
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index c51632597a..f883eb828c 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -61,13 +61,13 @@ int64 GlobalRandomValue() {
llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
int64 mantissa_bits,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
// Integer and float types for casting and constant generation.
llvm::Type* float_type = x->getType();
- llvm::IntegerType* int_type = ir_builder->getInt32Ty();
+ llvm::IntegerType* int_type = b->getInt32Ty();
// Cast the input value to an integer for bitwise manipulation.
- llvm::Value* x_as_int = ir_builder->CreateBitCast(x, int_type);
+ llvm::Value* x_as_int = b->CreateBitCast(x, int_type);
if (mantissa_bits < 23) {
// Last remaining mantissa bit.
@@ -77,22 +77,22 @@ llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
// equal to a base value of 0111... plus one bit if the last remaining
// mantissa bit is 1.
const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1;
- llvm::Value* x_last_mantissa_bit = ir_builder->CreateLShr(
- ir_builder->CreateAnd(
- x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
+ llvm::Value* x_last_mantissa_bit = b->CreateLShr(
+ b->CreateAnd(x_as_int,
+ llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
(23 - mantissa_bits));
- llvm::Value* x_rounding_bias = ir_builder->CreateAdd(
- x_last_mantissa_bit,
- llvm::ConstantInt::get(int_type, base_rounding_bias));
+ llvm::Value* x_rounding_bias =
+ b->CreateAdd(x_last_mantissa_bit,
+ llvm::ConstantInt::get(int_type, base_rounding_bias));
// Add rounding bias, and mask out truncated bits. Note that the case
// where adding the rounding bias overflows into the exponent bits is
// correct; the non-masked mantissa bits will all be zero, and the
// exponent will be incremented by one.
const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
- x_as_int = ir_builder->CreateAdd(x_as_int, x_rounding_bias);
- x_as_int = ir_builder->CreateAnd(
- x_as_int, llvm::ConstantInt::get(int_type, truncation_mask));
+ x_as_int = b->CreateAdd(x_as_int, x_rounding_bias);
+ x_as_int = b->CreateAnd(x_as_int,
+ llvm::ConstantInt::get(int_type, truncation_mask));
}
if (exponent_bits < 8) {
@@ -120,29 +120,29 @@ llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
f32_exponent_bias - reduced_exponent_bias;
// Do we overflow or underflow?
- llvm::Value* x_exponent = ir_builder->CreateAnd(
+ llvm::Value* x_exponent = b->CreateAnd(
x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
- llvm::Value* x_overflows = ir_builder->CreateICmpUGT(
+ llvm::Value* x_overflows = b->CreateICmpUGT(
x_exponent,
llvm::ConstantInt::get(int_type, reduced_max_exponent << 23));
- llvm::Value* x_underflows = ir_builder->CreateICmpULE(
+ llvm::Value* x_underflows = b->CreateICmpULE(
x_exponent,
llvm::ConstantInt::get(int_type, reduced_min_exponent << 23));
// Compute appropriately-signed values of zero and infinity.
- llvm::Value* x_signed_zero = ir_builder->CreateAnd(
+ llvm::Value* x_signed_zero = b->CreateAnd(
x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask));
- llvm::Value* x_signed_inf = ir_builder->CreateOr(
+ llvm::Value* x_signed_inf = b->CreateOr(
x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
// Force to zero or infinity if overflow or underflow. (Note that this
// truncates all denormal values to zero, rather than rounding them.)
- x_as_int = ir_builder->CreateSelect(x_overflows, x_signed_inf, x_as_int);
- x_as_int = ir_builder->CreateSelect(x_underflows, x_signed_zero, x_as_int);
+ x_as_int = b->CreateSelect(x_overflows, x_signed_inf, x_as_int);
+ x_as_int = b->CreateSelect(x_underflows, x_signed_zero, x_as_int);
}
// Cast the result back to a floating-point type.
- llvm::Value* result = ir_builder->CreateBitCast(x_as_int, float_type);
+ llvm::Value* result = b->CreateBitCast(x_as_int, float_type);
// Correct result for NaN inputs.
//
@@ -154,53 +154,49 @@ llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
//
// If the fast-math flags are set to assume no NaNs, the comparison is likely
// to be optimized away, so there's no point in even emitting it.
- if (!ir_builder->getFastMathFlags().noNaNs()) {
- llvm::Value* x_is_nan = ir_builder->CreateFCmpUNO(x, x);
+ if (!b->getFastMathFlags().noNaNs()) {
+ llvm::Value* x_is_nan = b->CreateFCmpUNO(x, x);
if (mantissa_bits > 0) {
- result = ir_builder->CreateSelect(x_is_nan, x, result);
+ result = b->CreateSelect(x_is_nan, x, result);
} else {
- result = ir_builder->CreateSelect(
+ result = b->CreateSelect(
x_is_nan, llvm::ConstantFP::getInfinity(float_type), result);
}
}
return result;
}
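// Scalar model of the mantissa rounding in the first block above. The bias is
// base + last_kept_bit, which gives round-to-nearest with ties to even; an
// overflow into the exponent is benign because the masked mantissa is then
// all zeros and the exponent simply increments:
//
//   uint32_t round_mantissa(uint32_t x_as_int, int mantissa_bits) {
//     const uint32_t last_bit  = 1u << (23 - mantissa_bits);
//     const uint32_t base_bias = (last_bit >> 1) - 1;
//     const uint32_t last_set  = (x_as_int & last_bit) >> (23 - mantissa_bits);
//     x_as_int += base_bias + last_set;   // rounding bias
//     return x_as_int & ~(last_bit - 1);  // truncate the dropped bits
//   }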
-llvm::Value* EmitF32ToBF16(llvm::Value* f32_value,
- llvm::IRBuilder<>* ir_builder) {
+llvm::Value* EmitF32ToBF16(llvm::Value* f32_value, llvm::IRBuilder<>* b) {
auto reduced_precision = EmitReducePrecisionFloat(
f32_value,
/*exponent_bits=*/primitive_util::kBFloat16ExponentBits,
- /*mantissa_bits=*/primitive_util::kBFloat16MantissaBits, ir_builder);
- auto as_int32 =
- ir_builder->CreateBitCast(reduced_precision, ir_builder->getInt32Ty());
- auto shifted = ir_builder->CreateLShr(as_int32, 16);
- auto truncated = ir_builder->CreateTrunc(shifted, ir_builder->getInt16Ty());
- return ir_builder->CreateBitCast(truncated, ir_builder->getInt16Ty());
+ /*mantissa_bits=*/primitive_util::kBFloat16MantissaBits, b);
+ auto as_int32 = b->CreateBitCast(reduced_precision, b->getInt32Ty());
+ auto shifted = b->CreateLShr(as_int32, 16);
+ auto truncated = b->CreateTrunc(shifted, b->getInt16Ty());
+ return b->CreateBitCast(truncated, b->getInt16Ty());
}
-llvm::Value* EmitBF16ToF32(llvm::Value* bf16_value,
- llvm::IRBuilder<>* ir_builder) {
- auto as_int16 =
- ir_builder->CreateBitCast(bf16_value, ir_builder->getInt16Ty());
- auto as_int32 = ir_builder->CreateZExt(as_int16, ir_builder->getInt32Ty());
- auto shifted = ir_builder->CreateShl(as_int32, 16);
- return ir_builder->CreateBitCast(shifted, ir_builder->getFloatTy());
+llvm::Value* EmitBF16ToF32(llvm::Value* bf16_value, llvm::IRBuilder<>* b) {
+ auto as_int16 = b->CreateBitCast(bf16_value, b->getInt16Ty());
+ auto as_int32 = b->CreateZExt(as_int16, b->getInt32Ty());
+ auto shifted = b->CreateShl(as_int32, 16);
+ return b->CreateBitCast(shifted, b->getFloatTy());
}
llvm::Value* EmitIntegralToFloating(llvm::Value* integer_value,
PrimitiveType from_type,
PrimitiveType to_type, llvm::Module* module,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
if (primitive_util::IsSignedIntegralType(from_type)) {
- return ir_builder->CreateSIToFP(
- integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+ return b->CreateSIToFP(integer_value,
+ llvm_ir::PrimitiveTypeToIrType(to_type, module));
} else {
CHECK(primitive_util::IsUnsignedIntegralType(from_type) ||
from_type == PRED);
- return ir_builder->CreateUIToFP(
- integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+ return b->CreateUIToFP(integer_value,
+ llvm_ir::PrimitiveTypeToIrType(to_type, module));
}
}
@@ -226,39 +222,43 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
case HloOpcode::kConvert: {
PrimitiveType from_type = op->operand(0)->shape().element_type();
PrimitiveType to_type = op->shape().element_type();
- CHECK(primitive_util::IsIntegralType(from_type) || from_type == PRED);
+ CHECK(primitive_util::IsIntegralType(from_type) || from_type == PRED)
+ << from_type;
if (from_type == to_type) {
return operand_value;
}
+ if (to_type == PRED) {
+ return b_->CreateZExt(
+ b_->CreateICmpNE(operand_value, llvm::ConstantInt::get(
+ operand_value->getType(), 0)),
+ llvm_ir::PrimitiveTypeToIrType(PRED, module_));
+ }
if (primitive_util::IsIntegralType(to_type)) {
- return ir_builder_->CreateIntCast(
+ return b_->CreateIntCast(
operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_),
primitive_util::IsSignedIntegralType(from_type));
}
if (primitive_util::IsFloatingPointType(to_type)) {
if (to_type == BF16) {
- return EmitF32ToBF16(
- EmitIntegralToFloating(operand_value, from_type, F32, module_,
- ir_builder_),
- ir_builder_);
+ return EmitF32ToBF16(EmitIntegralToFloating(operand_value, from_type,
+ F32, module_, b_),
+ b_);
}
return EmitIntegralToFloating(operand_value, from_type, to_type,
- module_, ir_builder_);
+ module_, b_);
}
if (primitive_util::IsComplexType(to_type)) {
auto to_ir_component_type = llvm_ir::PrimitiveTypeToIrType(
primitive_util::ComplexComponentType(to_type), module_);
if (primitive_util::IsSignedIntegralType(from_type)) {
return EmitComposeComplex(
- op,
- ir_builder_->CreateSIToFP(operand_value, to_ir_component_type),
+ op, b_->CreateSIToFP(operand_value, to_ir_component_type),
nullptr);
}
if (primitive_util::IsUnsignedIntegralType(from_type) ||
from_type == PRED) {
return EmitComposeComplex(
- op,
- ir_builder_->CreateUIToFP(operand_value, to_ir_component_type),
+ op, b_->CreateUIToFP(operand_value, to_ir_component_type),
nullptr);
}
}
@@ -275,7 +275,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
}
if (primitive_util::BitWidth(from_type) ==
primitive_util::BitWidth(to_type)) {
- return ir_builder_->CreateBitCast(
+ return b_->CreateBitCast(
operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
}
return InvalidArgument(
@@ -293,18 +293,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
auto type =
llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_);
auto zero = llvm::ConstantInt::get(type, 0);
- auto cmp = ir_builder_->CreateICmpSGE(operand_value, zero);
- return ir_builder_->CreateSelect(cmp, operand_value,
- ir_builder_->CreateNeg(operand_value));
+ auto cmp = b_->CreateICmpSGE(operand_value, zero);
+ return b_->CreateSelect(cmp, operand_value,
+ b_->CreateNeg(operand_value));
} else {
return operand_value;
}
}
case HloOpcode::kClz: {
- auto is_zero_undef = ir_builder_->getFalse();
- return llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::ctlz, {operand_value, is_zero_undef},
- {operand_value->getType()}, ir_builder_);
+ auto is_zero_undef = b_->getFalse();
+ return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::ctlz,
+ {operand_value, is_zero_undef},
+ {operand_value->getType()}, b_);
}
case HloOpcode::kSign: {
bool is_signed =
@@ -312,31 +312,28 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
auto type =
llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_);
auto zero = llvm::ConstantInt::get(type, 0);
- auto cmp = ir_builder_->CreateICmpEQ(operand_value, zero);
+ auto cmp = b_->CreateICmpEQ(operand_value, zero);
if (is_signed) {
- auto ashr = ir_builder_->CreateAShr(operand_value,
- type->getIntegerBitWidth() - 1);
- return ir_builder_->CreateSelect(cmp, zero,
- ir_builder_->CreateOr(ashr, 1));
+ auto ashr =
+ b_->CreateAShr(operand_value, type->getIntegerBitWidth() - 1);
+ return b_->CreateSelect(cmp, zero, b_->CreateOr(ashr, 1));
} else {
- return ir_builder_->CreateSelect(cmp, zero,
- llvm::ConstantInt::get(type, 1));
+ return b_->CreateSelect(cmp, zero, llvm::ConstantInt::get(type, 1));
}
}
case HloOpcode::kNegate:
- return ir_builder_->CreateNeg(operand_value);
+ return b_->CreateNeg(operand_value);
case HloOpcode::kNot: {
auto type = op->shape().element_type();
if (type == PRED) {
// It is not sufficient to just call CreateNot() here because a PRED
// is represented as an i8 and the truth value is stored only in the
// bottom bit.
- return ir_builder_->CreateZExt(
- ir_builder_->CreateNot(ir_builder_->CreateTrunc(
- operand_value, ir_builder_->getInt1Ty())),
+ return b_->CreateZExt(
+ b_->CreateNot(b_->CreateTrunc(operand_value, b_->getInt1Ty())),
llvm_ir::PrimitiveTypeToIrType(PRED, module_));
} else if (primitive_util::IsIntegralType(type)) {
- return ir_builder_->CreateNot(operand_value);
+ return b_->CreateNot(operand_value);
}
return Unimplemented("unary op Not is not defined for type '%d'", type);
}
@@ -352,7 +349,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
case HloOpcode::kConvert: {
PrimitiveType from_type = op->operand(0)->shape().element_type();
PrimitiveType to_type = op->shape().element_type();
- CHECK(primitive_util::IsFloatingPointType(from_type));
+ CHECK(primitive_util::IsFloatingPointType(from_type)) << from_type;
if (from_type == to_type) {
return operand_value;
}
@@ -364,32 +361,38 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
}
return EmitComposeComplex(
op,
- ir_builder_->CreateFPCast(
- operand_value,
- llvm_ir::PrimitiveTypeToIrType(to_component_type, module_)),
+ b_->CreateFPCast(operand_value, llvm_ir::PrimitiveTypeToIrType(
+ to_component_type, module_)),
nullptr);
}
if (from_type == BF16) {
TF_RET_CHECK(to_type != BF16);
- operand_value = EmitBF16ToF32(operand_value, ir_builder_);
+ operand_value = EmitBF16ToF32(operand_value, b_);
from_type = F32;
if (from_type == to_type) {
return operand_value;
}
}
if (from_type == F32 && to_type == BF16) {
- return EmitF32ToBF16(operand_value, ir_builder_);
+ return EmitF32ToBF16(operand_value, b_);
+ }
+ if (to_type == PRED) {
+ return b_->CreateZExt(
+ b_->CreateFCmpUNE(
+ operand_value,
+ llvm::ConstantFP::get(operand_value->getType(), 0.0)),
+ llvm_ir::PrimitiveTypeToIrType(PRED, module_));
}
if (primitive_util::IsFloatingPointType(to_type)) {
- return ir_builder_->CreateFPCast(
+ return b_->CreateFPCast(
operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
}
if (primitive_util::IsSignedIntegralType(to_type)) {
- return ir_builder_->CreateFPToSI(
+ return b_->CreateFPToSI(
operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
}
if (primitive_util::IsUnsignedIntegralType(to_type)) {
- return ir_builder_->CreateFPToUI(
+ return b_->CreateFPToUI(
operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
}
return Unimplemented("unhandled conversion operation: %s => %s",
@@ -405,7 +408,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
}
if (primitive_util::BitWidth(from_type) ==
primitive_util::BitWidth(to_type)) {
- return ir_builder_->CreateBitCast(
+ return b_->CreateBitCast(
operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
}
return InvalidArgument(
@@ -429,45 +432,45 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
case HloOpcode::kSin:
return EmitSin(op->shape().element_type(), operand_value);
case HloOpcode::kFloor:
- return llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::floor, {operand_value}, {operand_value->getType()},
- ir_builder_);
+ return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor,
+ {operand_value},
+ {operand_value->getType()}, b_);
case HloOpcode::kCeil:
- return llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::ceil, {operand_value}, {operand_value->getType()},
- ir_builder_);
+ return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::ceil,
+ {operand_value},
+ {operand_value->getType()}, b_);
case HloOpcode::kAbs:
- return llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::fabs, {operand_value}, {operand_value->getType()},
- ir_builder_);
+ return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs,
+ {operand_value},
+ {operand_value->getType()}, b_);
case HloOpcode::kRoundNearestAfz:
- return llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::round, {operand_value}, {operand_value->getType()},
- ir_builder_);
+ return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::round,
+ {operand_value},
+ {operand_value->getType()}, b_);
case HloOpcode::kSign: {
// TODO(b/32151903): Ensure consistent sign behavior for -0.0.
auto type = operand_value->getType();
auto zero = llvm::ConstantFP::get(type, 0.0);
- auto oeq = ir_builder_->CreateFCmpOEQ(operand_value, zero);
- auto olt = ir_builder_->CreateFCmpOLT(operand_value, zero);
- return ir_builder_->CreateSelect(
+ auto oeq = b_->CreateFCmpOEQ(operand_value, zero);
+ auto olt = b_->CreateFCmpOLT(operand_value, zero);
+ return b_->CreateSelect(
oeq, zero,
- ir_builder_->CreateSelect(olt, llvm::ConstantFP::get(type, -1.0),
- llvm::ConstantFP::get(type, 1.0)));
+ b_->CreateSelect(olt, llvm::ConstantFP::get(type, -1.0),
+ llvm::ConstantFP::get(type, 1.0)));
}
case HloOpcode::kIsFinite: {
      // abs(x) o!= inf (ordered not-equal); this works because the ordered
      // comparison returns false if either operand is NaN.
auto type = operand_value->getType();
auto abs_value = llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::fabs, {operand_value}, {type}, ir_builder_);
+ llvm::Intrinsic::fabs, {operand_value}, {type}, b_);
auto infinity = llvm::ConstantFP::getInfinity(type);
- auto not_infinite = ir_builder_->CreateFCmpONE(abs_value, infinity);
- return ir_builder_->CreateZExt(
- not_infinite, llvm_ir::PrimitiveTypeToIrType(PRED, module_));
+ auto not_infinite = b_->CreateFCmpONE(abs_value, infinity);
+ return b_->CreateZExt(not_infinite,
+ llvm_ir::PrimitiveTypeToIrType(PRED, module_));
}
case HloOpcode::kNegate:
- return ir_builder_->CreateFNeg(operand_value);
+ return b_->CreateFNeg(operand_value);
case HloOpcode::kReal:
return operand_value;
case HloOpcode::kImag:
@@ -491,13 +494,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
auto a = EmitExtractReal(operand_value);
auto b = EmitExtractImag(operand_value);
llvm::Type* llvm_ty = a->getType();
- auto sum_sq = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
- ir_builder_->CreateFMul(b, b));
+ auto sum_sq = b_->CreateFAdd(b_->CreateFMul(a, a), b_->CreateFMul(b, b));
TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq));
TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a));
auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
- return EmitComposeComplex(
- op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
+ return EmitComposeComplex(op, b_->CreateFMul(one_half, log_sum_sq),
+ angle);
}
case HloOpcode::kLog1p: {
// log1p(a+bi) = .5*log((a+1)^2+b^2) + i*atan2(b, a + 1)
@@ -505,15 +507,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
auto b = EmitExtractImag(operand_value);
llvm::Type* llvm_ty = a->getType();
auto one = llvm::ConstantFP::get(llvm_ty, 1.0);
- auto a_plus_one = ir_builder_->CreateFAdd(a, one);
- auto sum_sq = ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(a_plus_one, a_plus_one),
- ir_builder_->CreateFMul(b, b));
+ auto a_plus_one = b_->CreateFAdd(a, one);
+ auto sum_sq = b_->CreateFAdd(b_->CreateFMul(a_plus_one, a_plus_one),
+ b_->CreateFMul(b, b));
TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq));
TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a_plus_one));
auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
- return EmitComposeComplex(
- op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
+ return EmitComposeComplex(op, b_->CreateFMul(one_half, log_sum_sq),
+ angle);
}
case HloOpcode::kConvert: {
PrimitiveType from_type = op->operand(0)->shape().element_type();
@@ -527,12 +528,11 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
primitive_util::ComplexComponentType(to_type);
auto to_ir_component_type =
llvm_ir::PrimitiveTypeToIrType(to_component_type, module_);
- return EmitComposeComplex(
- op,
- ir_builder_->CreateFPCast(EmitExtractReal(operand_value),
- to_ir_component_type),
- ir_builder_->CreateFPCast(EmitExtractImag(operand_value),
- to_ir_component_type));
+ return EmitComposeComplex(op,
+ b_->CreateFPCast(EmitExtractReal(operand_value),
+ to_ir_component_type),
+ b_->CreateFPCast(EmitExtractImag(operand_value),
+ to_ir_component_type));
}
case HloOpcode::kExp: {
// e^(a+bi) = e^a*(cos(b)+sin(b)i)
@@ -542,8 +542,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
auto cos_b, EmitCos(component_type, EmitExtractImag(operand_value)));
TF_ASSIGN_OR_RETURN(
auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value)));
- return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
- ir_builder_->CreateFMul(exp_a, sin_b));
+ return EmitComposeComplex(op, b_->CreateFMul(exp_a, cos_b),
+ b_->CreateFMul(exp_a, sin_b));
}
case HloOpcode::kExpm1: {
// e^(a+bi)-1 = (e^a*cos(b)-1)+e^a*sin(b)i
@@ -554,9 +554,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
TF_ASSIGN_OR_RETURN(
auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value)));
auto one = llvm::ConstantFP::get(exp_a->getType(), 1.0);
- auto real_result =
- ir_builder_->CreateFSub(ir_builder_->CreateFMul(exp_a, cos_b), one);
- auto imag_result = ir_builder_->CreateFMul(exp_a, sin_b);
+ auto real_result = b_->CreateFSub(b_->CreateFMul(exp_a, cos_b), one);
+ auto imag_result = b_->CreateFMul(exp_a, sin_b);
return EmitComposeComplex(op, real_result, imag_result);
}
case HloOpcode::kCos: {
@@ -571,18 +570,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
auto b = EmitExtractImag(operand_value);
auto type = a->getType();
TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b));
- auto half_exp_b =
- ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
+ auto half_exp_b = b_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
auto half_exp_neg_b =
- ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
+ b_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a));
TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a));
return EmitComposeComplex(
- op,
- ir_builder_->CreateFMul(
- cos_a, ir_builder_->CreateFAdd(half_exp_neg_b, half_exp_b)),
- ir_builder_->CreateFMul(
- sin_a, ir_builder_->CreateFSub(half_exp_neg_b, half_exp_b)));
+ op, b_->CreateFMul(cos_a, b_->CreateFAdd(half_exp_neg_b, half_exp_b)),
+ b_->CreateFMul(sin_a, b_->CreateFSub(half_exp_neg_b, half_exp_b)));
}
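For reference, both complex trigonometric cases reduce to hyperbolic identities. With cosh(b) = half_exp_b + half_exp_neg_b and sinh(b) = half_exp_b - half_exp_neg_b,

    cos(a+bi) = cos(a)*cosh(b) - i*sin(a)*sinh(b)

which is exactly the pair composed above (real part cos_a*(half_exp_neg_b + half_exp_b), imaginary part sin_a*(half_exp_neg_b - half_exp_b)); the kSin case below follows the same pattern via sin(a+bi) = sin(a)*cosh(b) + i*cos(a)*sinh(b).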
case HloOpcode::kSin: {
// sin(z) = .5i(e^(-iz) - e^(iz))
@@ -598,18 +593,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
auto b = EmitExtractImag(operand_value);
auto type = a->getType();
TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b));
- auto half_exp_b =
- ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
+ auto half_exp_b = b_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
auto half_exp_neg_b =
- ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
+ b_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a));
TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a));
return EmitComposeComplex(
- op,
- ir_builder_->CreateFMul(
- sin_a, ir_builder_->CreateFAdd(half_exp_b, half_exp_neg_b)),
- ir_builder_->CreateFMul(
- cos_a, ir_builder_->CreateFSub(half_exp_b, half_exp_neg_b)));
+ op, b_->CreateFMul(sin_a, b_->CreateFAdd(half_exp_b, half_exp_neg_b)),
+ b_->CreateFMul(cos_a, b_->CreateFSub(half_exp_b, half_exp_neg_b)));
}
case HloOpcode::kTanh: {
/*
@@ -637,64 +628,61 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
TF_ASSIGN_OR_RETURN(auto exp_a, EmitExp(component_type, a));
TF_ASSIGN_OR_RETURN(auto cos_b, EmitCos(component_type, b));
TF_ASSIGN_OR_RETURN(auto sin_b, EmitSin(component_type, b));
- auto exp_neg_a = ir_builder_->CreateFDiv(
- llvm::ConstantFP::get(exp_a->getType(), 1), exp_a);
- auto exp_2a_minus_exp_neg_2a = ir_builder_->CreateFSub(
- ir_builder_->CreateFMul(exp_a, exp_a),
- ir_builder_->CreateFMul(exp_neg_a, exp_neg_a));
- auto cos_b_sq = ir_builder_->CreateFMul(cos_b, cos_b);
- auto sin_b_sq = ir_builder_->CreateFMul(sin_b, sin_b);
- auto real_num = ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a),
- ir_builder_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a));
- auto cos_b_sin_b = ir_builder_->CreateFMul(cos_b, sin_b);
- auto exp_a_plus_exp_neg_a = ir_builder_->CreateFAdd(exp_a, exp_neg_a);
+ auto exp_neg_a =
+ b_->CreateFDiv(llvm::ConstantFP::get(exp_a->getType(), 1), exp_a);
+ auto exp_2a_minus_exp_neg_2a = b_->CreateFSub(
+ b_->CreateFMul(exp_a, exp_a), b_->CreateFMul(exp_neg_a, exp_neg_a));
+ auto cos_b_sq = b_->CreateFMul(cos_b, cos_b);
+ auto sin_b_sq = b_->CreateFMul(sin_b, sin_b);
+ auto real_num =
+ b_->CreateFAdd(b_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a),
+ b_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a));
+ auto cos_b_sin_b = b_->CreateFMul(cos_b, sin_b);
+ auto exp_a_plus_exp_neg_a = b_->CreateFAdd(exp_a, exp_neg_a);
auto exp_a_plus_exp_neg_a_sq =
- ir_builder_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a);
- auto exp_a_minus_exp_neg_a = ir_builder_->CreateFSub(exp_a, exp_neg_a);
+ b_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a);
+ auto exp_a_minus_exp_neg_a = b_->CreateFSub(exp_a, exp_neg_a);
auto exp_a_minus_exp_neg_a_sq =
- ir_builder_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a);
- auto imag_num = ir_builder_->CreateFMul(
- cos_b_sin_b, ir_builder_->CreateFSub(exp_a_plus_exp_neg_a_sq,
- exp_a_minus_exp_neg_a_sq));
- auto denom = ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq),
- ir_builder_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq));
- return EmitComposeComplex(op, ir_builder_->CreateFDiv(real_num, denom),
- ir_builder_->CreateFDiv(imag_num, denom));
+ b_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a);
+ auto imag_num = b_->CreateFMul(
+ cos_b_sin_b,
+ b_->CreateFSub(exp_a_plus_exp_neg_a_sq, exp_a_minus_exp_neg_a_sq));
+ auto denom =
+ b_->CreateFAdd(b_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq),
+ b_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq));
+ return EmitComposeComplex(op, b_->CreateFDiv(real_num, denom),
+ b_->CreateFDiv(imag_num, denom));
}
case HloOpcode::kAbs: {
- auto sum_sq = ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(EmitExtractReal(operand_value),
- EmitExtractReal(operand_value)),
- ir_builder_->CreateFMul(EmitExtractImag(operand_value),
- EmitExtractImag(operand_value)));
+ auto sum_sq =
+ b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(operand_value),
+ EmitExtractReal(operand_value)),
+ b_->CreateFMul(EmitExtractImag(operand_value),
+ EmitExtractImag(operand_value)));
return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sqrt, {sum_sq},
- {sum_sq->getType()}, ir_builder_);
+ {sum_sq->getType()}, b_);
}
case HloOpcode::kSign: { // Sign(c) = c / |c|
- auto sum_sq = ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(EmitExtractReal(operand_value),
- EmitExtractReal(operand_value)),
- ir_builder_->CreateFMul(EmitExtractImag(operand_value),
- EmitExtractImag(operand_value)));
+ auto sum_sq =
+ b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(operand_value),
+ EmitExtractReal(operand_value)),
+ b_->CreateFMul(EmitExtractImag(operand_value),
+ EmitExtractImag(operand_value)));
auto cplx_abs = llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::sqrt, {sum_sq}, {sum_sq->getType()}, ir_builder_);
+ llvm::Intrinsic::sqrt, {sum_sq}, {sum_sq->getType()}, b_);
auto type = cplx_abs->getType();
auto zero = llvm::ConstantFP::get(type, 0.0);
- auto oeq = ir_builder_->CreateFCmpOEQ(cplx_abs, zero);
- return ir_builder_->CreateSelect(
+ auto oeq = b_->CreateFCmpOEQ(cplx_abs, zero);
+ return b_->CreateSelect(
oeq, EmitComposeComplex(op, zero, zero),
EmitComposeComplex(
- op,
- ir_builder_->CreateFDiv(EmitExtractReal(operand_value), cplx_abs),
- ir_builder_->CreateFDiv(EmitExtractImag(operand_value),
- cplx_abs)));
+ op, b_->CreateFDiv(EmitExtractReal(operand_value), cplx_abs),
+ b_->CreateFDiv(EmitExtractImag(operand_value), cplx_abs)));
}
case HloOpcode::kNegate:
- return EmitComposeComplex(
- op, ir_builder_->CreateFNeg(EmitExtractReal(operand_value)),
- ir_builder_->CreateFNeg(EmitExtractImag(operand_value)));
+ return EmitComposeComplex(op,
+ b_->CreateFNeg(EmitExtractReal(operand_value)),
+ b_->CreateFNeg(EmitExtractImag(operand_value)));
case HloOpcode::kReal:
return EmitExtractReal(operand_value);
case HloOpcode::kImag:
@@ -728,15 +716,15 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
case HloOpcode::kComplex:
return EmitComposeComplex(op, lhs_value, rhs_value);
case HloOpcode::kAdd:
- return ir_builder_->CreateFAdd(lhs_value, rhs_value);
+ return b_->CreateFAdd(lhs_value, rhs_value);
case HloOpcode::kSubtract:
- return ir_builder_->CreateFSub(lhs_value, rhs_value);
+ return b_->CreateFSub(lhs_value, rhs_value);
case HloOpcode::kMultiply:
- return ir_builder_->CreateFMul(lhs_value, rhs_value);
+ return b_->CreateFMul(lhs_value, rhs_value);
case HloOpcode::kDivide:
- return ir_builder_->CreateFDiv(lhs_value, rhs_value);
+ return b_->CreateFDiv(lhs_value, rhs_value);
case HloOpcode::kRemainder:
- return ir_builder_->CreateFRem(lhs_value, rhs_value);
+ return b_->CreateFRem(lhs_value, rhs_value);
// LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered
// comparisons always return false when one of the operands is NaN, whereas
// unordered comparisons return true.
@@ -746,22 +734,22 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
// matches C++'s semantics.
case HloOpcode::kEq:
return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, lhs_value,
- rhs_value, ir_builder_);
+ rhs_value, b_);
case HloOpcode::kNe:
return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, lhs_value,
- rhs_value, ir_builder_);
+ rhs_value, b_);
case HloOpcode::kLt:
return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLT, lhs_value,
- rhs_value, ir_builder_);
+ rhs_value, b_);
case HloOpcode::kGt:
return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGT, lhs_value,
- rhs_value, ir_builder_);
+ rhs_value, b_);
case HloOpcode::kLe:
return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLE, lhs_value,
- rhs_value, ir_builder_);
+ rhs_value, b_);
case HloOpcode::kGe:
return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGE, lhs_value,
- rhs_value, ir_builder_);
+ rhs_value, b_);
case HloOpcode::kMaximum:
return EmitFloatMax(lhs_value, rhs_value);
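For reference, the NaN behavior of the ordered/unordered split above can be checked in scalar C++, whose built-in floating comparisons have exactly the FCMP_OEQ/FCMP_UNE semantics (a standalone sketch, not part of the patch):

    #include <cassert>
    #include <cmath>

    void OrderedVsUnordered() {
      double nan = std::nan("");
      assert(!(nan == 1.0));  // ordered eq (FCMP_OEQ): false for NaN operands
      assert(nan != 1.0);     // unordered ne (FCMP_UNE): true for NaN operands
      // Hence kNe stays equivalent to !kEq even in the presence of NaNs.
    }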
@@ -782,64 +770,56 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
llvm::Value* rhs_value) const {
switch (op->opcode()) {
case HloOpcode::kAdd:
- return EmitComposeComplex(
- op,
- ir_builder_->CreateFAdd(EmitExtractReal(lhs_value),
- EmitExtractReal(rhs_value)),
- ir_builder_->CreateFAdd(EmitExtractImag(lhs_value),
- EmitExtractImag(rhs_value)));
+ return EmitComposeComplex(op,
+ b_->CreateFAdd(EmitExtractReal(lhs_value),
+ EmitExtractReal(rhs_value)),
+ b_->CreateFAdd(EmitExtractImag(lhs_value),
+ EmitExtractImag(rhs_value)));
case HloOpcode::kSubtract:
- return EmitComposeComplex(
- op,
- ir_builder_->CreateFSub(EmitExtractReal(lhs_value),
- EmitExtractReal(rhs_value)),
- ir_builder_->CreateFSub(EmitExtractImag(lhs_value),
- EmitExtractImag(rhs_value)));
+ return EmitComposeComplex(op,
+ b_->CreateFSub(EmitExtractReal(lhs_value),
+ EmitExtractReal(rhs_value)),
+ b_->CreateFSub(EmitExtractImag(lhs_value),
+ EmitExtractImag(rhs_value)));
case HloOpcode::kMultiply:
return EmitComposeComplex(
op,
- ir_builder_->CreateFSub(
- ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
- EmitExtractReal(rhs_value)),
- ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
- EmitExtractImag(rhs_value))),
- ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
- EmitExtractImag(rhs_value)),
- ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
- EmitExtractReal(rhs_value))));
+ b_->CreateFSub(b_->CreateFMul(EmitExtractReal(lhs_value),
+ EmitExtractReal(rhs_value)),
+ b_->CreateFMul(EmitExtractImag(lhs_value),
+ EmitExtractImag(rhs_value))),
+ b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(lhs_value),
+ EmitExtractImag(rhs_value)),
+ b_->CreateFMul(EmitExtractImag(lhs_value),
+ EmitExtractReal(rhs_value))));
case HloOpcode::kDivide: {
// (a+bi) / (c+di) = ((a+bi)(c-di)) / ((c+di)(c-di))
// = ((ac + bd) + (bc - ad)i) / (c^2 + d^2)
- auto rhs_sum_sq = ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(EmitExtractReal(rhs_value),
- EmitExtractReal(rhs_value)),
- ir_builder_->CreateFMul(EmitExtractImag(rhs_value),
- EmitExtractImag(rhs_value)));
+ auto rhs_sum_sq =
+ b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(rhs_value),
+ EmitExtractReal(rhs_value)),
+ b_->CreateFMul(EmitExtractImag(rhs_value),
+ EmitExtractImag(rhs_value)));
auto type = rhs_sum_sq->getType();
auto zero = llvm::ConstantFP::get(type, 0.0);
- auto oeq = ir_builder_->CreateFCmpOEQ(rhs_sum_sq, zero);
- auto real_inf_or_nan =
- ir_builder_->CreateFDiv(EmitExtractReal(lhs_value), zero);
- auto imag_inf_or_nan =
- ir_builder_->CreateFDiv(EmitExtractImag(lhs_value), zero);
- return ir_builder_->CreateSelect(
+ auto oeq = b_->CreateFCmpOEQ(rhs_sum_sq, zero);
+ auto real_inf_or_nan = b_->CreateFDiv(EmitExtractReal(lhs_value), zero);
+ auto imag_inf_or_nan = b_->CreateFDiv(EmitExtractImag(lhs_value), zero);
+ return b_->CreateSelect(
oeq, EmitComposeComplex(op, real_inf_or_nan, imag_inf_or_nan),
EmitComposeComplex(
op,
- ir_builder_->CreateFDiv(
- ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
- EmitExtractReal(rhs_value)),
- ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
- EmitExtractImag(rhs_value))),
+ b_->CreateFDiv(
+ b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(lhs_value),
+ EmitExtractReal(rhs_value)),
+ b_->CreateFMul(EmitExtractImag(lhs_value),
+ EmitExtractImag(rhs_value))),
rhs_sum_sq),
- ir_builder_->CreateFDiv(
- ir_builder_->CreateFSub(
- ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
- EmitExtractReal(rhs_value)),
- ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
- EmitExtractImag(rhs_value))),
+ b_->CreateFDiv(
+ b_->CreateFSub(b_->CreateFMul(EmitExtractImag(lhs_value),
+ EmitExtractReal(rhs_value)),
+ b_->CreateFMul(EmitExtractReal(lhs_value),
+ EmitExtractImag(rhs_value))),
rhs_sum_sq)));
}
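A quick numeric check of the conjugate-multiplication identity in the comment above (host-side sketch, assuming only <complex>):

    #include <cassert>
    #include <complex>

    void CheckComplexDivide() {
      // (1+2i)/(3+4i): rhs_sum_sq = 25, real = (1*3 + 2*4)/25 = 0.44,
      // imag = (2*3 - 1*4)/25 = 0.08, matching std::complex division.
      std::complex<double> z =
          std::complex<double>(1, 2) / std::complex<double>(3, 4);
      assert(std::abs(z.real() - 0.44) < 1e-12);
      assert(std::abs(z.imag() - 0.08) < 1e-12);
    }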
// LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered
@@ -850,21 +830,21 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
// unordered comparison. This makes x != y equivalent to !(x == y), and
// matches C++'s semantics.
case HloOpcode::kEq:
- return ir_builder_->CreateAnd(
+ return b_->CreateAnd(
llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
EmitExtractReal(lhs_value),
- EmitExtractReal(rhs_value), ir_builder_),
+ EmitExtractReal(rhs_value), b_),
llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
EmitExtractImag(lhs_value),
- EmitExtractImag(rhs_value), ir_builder_));
+ EmitExtractImag(rhs_value), b_));
case HloOpcode::kNe:
- return ir_builder_->CreateOr(
+ return b_->CreateOr(
llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
EmitExtractReal(lhs_value),
- EmitExtractReal(rhs_value), ir_builder_),
+ EmitExtractReal(rhs_value), b_),
llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
EmitExtractImag(lhs_value),
- EmitExtractImag(rhs_value), ir_builder_));
+ EmitExtractImag(rhs_value), b_));
case HloOpcode::kPower: {
// (a+bi)^(c+di) =
@@ -876,29 +856,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
auto b = EmitExtractImag(lhs_value);
auto c = EmitExtractReal(rhs_value);
auto d = EmitExtractImag(rhs_value);
- auto aa_p_bb = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
- ir_builder_->CreateFMul(b, b));
+ auto aa_p_bb = b_->CreateFAdd(b_->CreateFMul(a, a), b_->CreateFMul(b, b));
auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
- auto half_c = ir_builder_->CreateFMul(one_half, c);
+ auto half_c = b_->CreateFMul(one_half, c);
TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c,
EmitPow(component_type, aa_p_bb, half_c));
- auto neg_d = ir_builder_->CreateFNeg(d);
+ auto neg_d = b_->CreateFNeg(d);
TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a));
- auto neg_d_arg_lhs = ir_builder_->CreateFMul(neg_d, arg_lhs);
+ auto neg_d_arg_lhs = b_->CreateFMul(neg_d, arg_lhs);
TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs,
EmitExp(component_type, neg_d_arg_lhs));
- auto coeff =
- ir_builder_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
+ auto coeff = b_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb));
- auto half_d = ir_builder_->CreateFMul(one_half, d);
- auto q =
- ir_builder_->CreateFAdd(ir_builder_->CreateFMul(c, arg_lhs),
- ir_builder_->CreateFMul(half_d, ln_aa_p_bb));
+ auto half_d = b_->CreateFMul(one_half, d);
+ auto q = b_->CreateFAdd(b_->CreateFMul(c, arg_lhs),
+ b_->CreateFMul(half_d, ln_aa_p_bb));
TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q));
TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q));
- return EmitComposeComplex(op, ir_builder_->CreateFMul(coeff, cos_q),
- ir_builder_->CreateFMul(coeff, sin_q));
+ return EmitComposeComplex(op, b_->CreateFMul(coeff, cos_q),
+ b_->CreateFMul(coeff, sin_q));
}
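Spelled out in the names used above, the kPower case is just exp((c+di) * log(a+bi)):

    (a+bi)^(c+di) = exp((c+di) * (0.5*ln(aa_p_bb) + i*arg_lhs))
                  = aa_p_bb^(c/2) * e^(-d*arg_lhs)          // = coeff
                    * (cos(q) + i*sin(q)),
      with q = c*arg_lhs + (d/2)*ln(aa_p_bb)

so the returned coeff*(cos_q + i*sin_q) is the principal branch of the complex power.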
default:
return Unimplemented("binary complex op '%s'",
@@ -908,12 +885,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
llvm::Value* ElementalIrEmitter::EmitFloatMax(llvm::Value* lhs_value,
llvm::Value* rhs_value) const {
- return llvm_ir::EmitFloatMax(lhs_value, rhs_value, ir_builder_);
+ return llvm_ir::EmitFloatMax(lhs_value, rhs_value, b_);
}
llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value,
llvm::Value* rhs_value) const {
- return llvm_ir::EmitFloatMin(lhs_value, rhs_value, ir_builder_);
+ return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_);
}
StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
@@ -925,15 +902,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
"type F32.");
}
auto getFloat = [&](const float f) {
- return llvm::ConstantFP::get(ir_builder_->getFloatTy(), f);
+ return llvm::ConstantFP::get(b_->getFloatTy(), f);
};
auto multiply_add = [&](tensorflow::gtl::ArraySlice<float> coefficients,
llvm::Value* w) {
llvm::Value* p = getFloat(coefficients.front());
coefficients.pop_front();
for (float coefficient : coefficients) {
- p = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(p, w),
- getFloat(coefficient));
+ p = b_->CreateFAdd(b_->CreateFMul(p, w), getFloat(coefficient));
}
return p;
};
@@ -951,50 +927,48 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
// }
// return p*x
llvm::Function* logf_fn = llvm::Intrinsic::getDeclaration(
- module_, llvm::Intrinsic::log, {ir_builder_->getFloatTy()});
+ module_, llvm::Intrinsic::log, {b_->getFloatTy()});
- llvm::Value* w = ir_builder_->CreateFNeg(ir_builder_->CreateCall(
- logf_fn,
- {ir_builder_->CreateFMul(ir_builder_->CreateFSub(getFloat(1.0f), x),
- ir_builder_->CreateFAdd(getFloat(1.0f), x))}));
+ llvm::Value* w = b_->CreateFNeg(b_->CreateCall(
+ logf_fn, {b_->CreateFMul(b_->CreateFSub(getFloat(1.0f), x),
+ b_->CreateFAdd(getFloat(1.0f), x))}));
- llvm::Value* p_addr = llvm_ir::EmitAllocaAtFunctionEntry(
- ir_builder_->getFloatTy(), "p.addr", ir_builder_);
+ llvm::Value* p_addr =
+ llvm_ir::EmitAllocaAtFunctionEntry(b_->getFloatTy(), "p.addr", b_);
- llvm_ir::LlvmIfData if_data =
- llvm_ir::EmitIfThenElse(ir_builder_->CreateFCmpOLT(w, getFloat(5.0f)),
- "w_less_than_five", ir_builder_);
+ llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
+ b_->CreateFCmpOLT(w, getFloat(5.0f)), "w_less_than_five", b_);
// Handle true BB.
- SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+ SetToFirstInsertPoint(if_data.true_block, b_);
{
- llvm::Value* lw = ir_builder_->CreateFSub(w, getFloat(2.5f));
+ llvm::Value* lw = b_->CreateFSub(w, getFloat(2.5f));
tensorflow::gtl::ArraySlice<float> lq{
2.81022636e-08f, 3.43273939e-07f, -3.5233877e-06f,
-4.39150654e-06f, 0.00021858087f, -0.00125372503f,
-0.00417768164f, 0.246640727f, 1.50140941f};
llvm::Value* p = multiply_add(lq, lw);
- ir_builder_->CreateStore(p, p_addr);
+ b_->CreateStore(p, p_addr);
}
// Handle false BB.
- SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+ SetToFirstInsertPoint(if_data.false_block, b_);
{
llvm::Function* sqrtf_fn = llvm::Intrinsic::getDeclaration(
- module_, llvm::Intrinsic::sqrt, {ir_builder_->getFloatTy()});
+ module_, llvm::Intrinsic::sqrt, {b_->getFloatTy()});
- llvm::Value* gw = ir_builder_->CreateFSub(
- ir_builder_->CreateCall(sqrtf_fn, {w}), getFloat(3.0f));
+ llvm::Value* gw =
+ b_->CreateFSub(b_->CreateCall(sqrtf_fn, {w}), getFloat(3.0f));
tensorflow::gtl::ArraySlice<float> gq{
-0.000200214257f, 0.000100950558f, 0.00134934322f,
-0.00367342844f, 0.00573950773f, -0.0076224613f,
0.00943887047f, 1.00167406f, 2.83297682f};
llvm::Value* p = multiply_add(gq, gw);
- ir_builder_->CreateStore(p, p_addr);
+ b_->CreateStore(p, p_addr);
}
- SetToFirstInsertPoint(if_data.after_block, ir_builder_);
- llvm::Value* p = ir_builder_->CreateLoad(p_addr);
- return ir_builder_->CreateFMul(p, x);
+ SetToFirstInsertPoint(if_data.after_block, b_);
+ llvm::Value* p = b_->CreateLoad(p_addr);
+ return b_->CreateFMul(p, x);
}
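The multiply_add lambda above is Horner's rule for polynomial evaluation; the same computation as a standalone scalar sketch (illustration only, not part of the patch):

    #include <vector>

    // Evaluates p(w) = (...((c0*w + c1)*w + c2)...)*w + cn, one multiply and
    // one add per coefficient after the first -- exactly the chain of
    // CreateFMul/CreateFAdd that multiply_add emits.
    float HornerEval(const std::vector<float>& coefficients, float w) {
      float p = coefficients.front();
      for (size_t i = 1; i < coefficients.size(); ++i) {
        p = p * w + coefficients[i];
      }
      return p;
    }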
StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfcInv(
@@ -1002,13 +976,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfcInv(
// Compute erfcinv(value) by calculating erfinv(1.0 - value).
auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_);
auto one = llvm::ConstantFP::get(type, 1.0);
- return EmitErfInv(prim_type, ir_builder_->CreateFSub(one, value));
+ return EmitErfInv(prim_type, b_->CreateFSub(one, value));
}
StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog(PrimitiveType prim_type,
llvm::Value* value) const {
return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::log, {value},
- {value->getType()}, ir_builder_);
+ {value->getType()}, b_);
}
StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type,
@@ -1020,35 +994,34 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type,
// When x is large, the naive evaluation of ln(x + 1) is more
// accurate than the Taylor series.
TF_ASSIGN_OR_RETURN(auto for_large_x,
- EmitLog(prim_type, ir_builder_->CreateFAdd(x, one)));
+ EmitLog(prim_type, b_->CreateFAdd(x, one)));
   // The Taylor series for ln(x+1) is x - x^2/2 + x^3/3 - ….
- auto for_small_x = ir_builder_->CreateFMul(
- ir_builder_->CreateFAdd(ir_builder_->CreateFMul(negative_half, x), one),
- x);
+ auto for_small_x =
+ b_->CreateFMul(b_->CreateFAdd(b_->CreateFMul(negative_half, x), one), x);
const auto kAntilogarithmIsSmallThreshold = 1e-4;
- auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value},
- {type}, ir_builder_);
- auto x_is_small = ir_builder_->CreateFCmpOLT(
+ auto abs_x =
+ llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_);
+ auto x_is_small = b_->CreateFCmpOLT(
abs_x, llvm::ConstantFP::get(type, kAntilogarithmIsSmallThreshold));
- return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x);
+ return b_->CreateSelect(x_is_small, for_small_x, for_large_x);
}
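The reason for the small-x branch, as a host-side sketch: adding 1.0 discards the low bits of a tiny x, so the naive form loses roughly |log10(x)| significant digits, while the truncated series keeps full precision.

    #include <cmath>
    #include <cstdio>

    void Log1pPrecisionDemo() {
      double x = 1e-9;
      double naive = std::log(1.0 + x);     // 1.0 + x rounds away most of x
      double series = (1.0 - 0.5 * x) * x;  // matches for_small_x above
      std::printf("naive=%.17g series=%.17g exact=%.17g\n",
                  naive, series, std::log1p(x));
    }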
StatusOr<llvm::Value*> ElementalIrEmitter::EmitSin(PrimitiveType prim_type,
llvm::Value* value) const {
return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value},
- {value->getType()}, ir_builder_);
+ {value->getType()}, b_);
}
StatusOr<llvm::Value*> ElementalIrEmitter::EmitCos(PrimitiveType prim_type,
llvm::Value* value) const {
return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {value},
- {value->getType()}, ir_builder_);
+ {value->getType()}, b_);
}
StatusOr<llvm::Value*> ElementalIrEmitter::EmitExp(PrimitiveType prim_type,
llvm::Value* value) const {
return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {value},
- {value->getType()}, ir_builder_);
+ {value->getType()}, b_);
}
StatusOr<llvm::Value*> ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type,
@@ -1060,25 +1033,25 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type,
// When the exponent is large, the naive evaluation of e^(x) - 1 is more
// accurate than the Taylor series.
TF_ASSIGN_OR_RETURN(auto exp_x, EmitExp(prim_type, value));
- auto for_large_x = ir_builder_->CreateFSub(exp_x, one);
+ auto for_large_x = b_->CreateFSub(exp_x, one);
// The Taylor series for exp(x) is 1 + x + x^2/2 + x^3/6 + ….
// We want exp(x)-1 which is x + x^2/2 + x^3/6 + ….
- auto x_squared = ir_builder_->CreateFAdd(x, x);
- auto x_squared_over_two = ir_builder_->CreateFMul(x_squared, half);
- auto for_small_x = ir_builder_->CreateFAdd(x, x_squared_over_two);
+  auto x_squared = b_->CreateFMul(x, x);
+ auto x_squared_over_two = b_->CreateFMul(x_squared, half);
+ auto for_small_x = b_->CreateFAdd(x, x_squared_over_two);
const auto kExponentIsSmallThreshold = 1e-5;
- auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value},
- {type}, ir_builder_);
- auto x_is_small = ir_builder_->CreateFCmpOLT(
+ auto abs_x =
+ llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_);
+ auto x_is_small = b_->CreateFCmpOLT(
abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold));
- return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x);
+ return b_->CreateSelect(x_is_small, for_small_x, for_large_x);
}
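Same structure as EmitLog1p, and the same precision argument (host-side sketch):

    #include <cmath>
    #include <cstdio>

    void Expm1PrecisionDemo() {
      double x = 1e-9;
      double naive = std::exp(x) - 1.0;   // the leading 1 absorbs x's bits
      double series = x + (x * x) * 0.5;  // matches for_small_x above
      std::printf("naive=%.17g series=%.17g exact=%.17g\n",
                  naive, series, std::expm1(x));
    }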
StatusOr<llvm::Value*> ElementalIrEmitter::EmitPow(PrimitiveType prim_type,
llvm::Value* lhs,
llvm::Value* rhs) const {
return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::pow, {lhs, rhs},
- {lhs->getType()}, ir_builder_);
+ {lhs->getType()}, b_);
}
StatusOr<llvm::Value*> ElementalIrEmitter::EmitAtan2(PrimitiveType prim_type,
@@ -1093,11 +1066,10 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
return Unimplemented("reduce-precision only implemented for F32");
}
return EmitReducePrecisionFloat(x, /*exponent_bits=*/hlo->exponent_bits(),
- /*mantissa_bits=*/hlo->mantissa_bits(),
- ir_builder_);
+ /*mantissa_bits=*/hlo->mantissa_bits(), b_);
}
-static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* ir_builder,
+static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* b,
llvm::Value* lhs, llvm::Value* rhs,
llvm::Value* shift_result,
bool saturate_to_sign_bit) {
@@ -1110,15 +1082,14 @@ static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* ir_builder,
llvm::ConstantInt* minus_one = llvm::ConstantInt::get(integer_type, -1);
llvm::Value* saturated_value;
if (saturate_to_sign_bit) {
- saturated_value = ir_builder->CreateSelect(
- ir_builder->CreateICmpSLT(lhs, zero), minus_one, zero);
+ saturated_value =
+ b->CreateSelect(b->CreateICmpSLT(lhs, zero), minus_one, zero);
} else {
saturated_value = zero;
}
llvm::Value* shift_amt_in_range =
- ir_builder->CreateICmpULT(rhs, integer_bitsize_constant, "shft.chk");
- return ir_builder->CreateSelect(shift_amt_in_range, shift_result,
- saturated_value);
+ b->CreateICmpULT(rhs, integer_bitsize_constant, "shft.chk");
+ return b->CreateSelect(shift_amt_in_range, shift_result, saturated_value);
}
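The net effect of SaturateShiftIfNecessary, written out as scalar semantics for a 32-bit arithmetic right shift (hypothetical helper; an arithmetic shift of negative lhs is assumed, as on mainstream compilers):

    #include <cstdint>

    int32_t SaturatedAShr32(int32_t lhs, uint32_t rhs) {
      // Shift amounts >= the bit width are poison in LLVM; the select above
      // replaces them with the fully shifted-out value, which for
      // saturate_to_sign_bit=true is all sign bits.
      if (rhs >= 32) return lhs < 0 ? -1 : 0;
      return lhs >> rhs;  // in-range shifts pass through unchanged
    }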
StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
@@ -1127,49 +1098,49 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
switch (op->opcode()) {
// TODO(jingyue): add the "nsw" attribute for signed types.
case HloOpcode::kAdd:
- return ir_builder_->CreateAdd(lhs_value, rhs_value);
+ return b_->CreateAdd(lhs_value, rhs_value);
case HloOpcode::kSubtract:
- return ir_builder_->CreateSub(lhs_value, rhs_value);
+ return b_->CreateSub(lhs_value, rhs_value);
case HloOpcode::kMultiply:
- return ir_builder_->CreateMul(lhs_value, rhs_value);
+ return b_->CreateMul(lhs_value, rhs_value);
case HloOpcode::kDivide:
- return is_signed ? ir_builder_->CreateSDiv(lhs_value, rhs_value)
- : ir_builder_->CreateUDiv(lhs_value, rhs_value);
+ return is_signed ? b_->CreateSDiv(lhs_value, rhs_value)
+ : b_->CreateUDiv(lhs_value, rhs_value);
case HloOpcode::kRemainder:
- return is_signed ? ir_builder_->CreateSRem(lhs_value, rhs_value)
- : ir_builder_->CreateURem(lhs_value, rhs_value);
+ return is_signed ? b_->CreateSRem(lhs_value, rhs_value)
+ : b_->CreateURem(lhs_value, rhs_value);
case HloOpcode::kEq:
return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_EQ, lhs_value,
- rhs_value, ir_builder_);
+ rhs_value, b_);
case HloOpcode::kNe:
return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_NE, lhs_value,
- rhs_value, ir_builder_);
+ rhs_value, b_);
case HloOpcode::kLt:
return llvm_ir::EmitComparison(
is_signed ? llvm::CmpInst::ICMP_SLT : llvm::CmpInst::ICMP_ULT,
- lhs_value, rhs_value, ir_builder_);
+ lhs_value, rhs_value, b_);
case HloOpcode::kGt:
return llvm_ir::EmitComparison(
is_signed ? llvm::CmpInst::ICMP_SGT : llvm::CmpInst::ICMP_UGT,
- lhs_value, rhs_value, ir_builder_);
+ lhs_value, rhs_value, b_);
case HloOpcode::kLe:
return llvm_ir::EmitComparison(
is_signed ? llvm::CmpInst::ICMP_SLE : llvm::CmpInst::ICMP_ULE,
- lhs_value, rhs_value, ir_builder_);
+ lhs_value, rhs_value, b_);
case HloOpcode::kGe:
return llvm_ir::EmitComparison(
is_signed ? llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE,
- lhs_value, rhs_value, ir_builder_);
+ lhs_value, rhs_value, b_);
case HloOpcode::kMinimum:
return EmitIntegralMin(lhs_value, rhs_value, is_signed);
case HloOpcode::kMaximum:
return EmitIntegralMax(lhs_value, rhs_value, is_signed);
case HloOpcode::kAnd:
- return ir_builder_->CreateAnd(lhs_value, rhs_value);
+ return b_->CreateAnd(lhs_value, rhs_value);
case HloOpcode::kOr:
- return ir_builder_->CreateOr(lhs_value, rhs_value);
+ return b_->CreateOr(lhs_value, rhs_value);
case HloOpcode::kXor:
- return ir_builder_->CreateXor(lhs_value, rhs_value);
+ return b_->CreateXor(lhs_value, rhs_value);
// Shifting out bits >= the number of bits in the type being shifted
// produces a poison value in LLVM which is basically "deferred undefined
@@ -1177,20 +1148,17 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
// UB. We replace the poison value with a constant to avoid this deferred
// UB.
case HloOpcode::kShiftRightArithmetic:
- return SaturateShiftIfNecessary(
- ir_builder_, lhs_value, rhs_value,
- ir_builder_->CreateAShr(lhs_value, rhs_value),
- /*saturate_to_sign_bit=*/true);
+ return SaturateShiftIfNecessary(b_, lhs_value, rhs_value,
+ b_->CreateAShr(lhs_value, rhs_value),
+ /*saturate_to_sign_bit=*/true);
case HloOpcode::kShiftLeft:
- return SaturateShiftIfNecessary(
- ir_builder_, lhs_value, rhs_value,
- ir_builder_->CreateShl(lhs_value, rhs_value),
- /*saturate_to_sign_bit=*/false);
+ return SaturateShiftIfNecessary(b_, lhs_value, rhs_value,
+ b_->CreateShl(lhs_value, rhs_value),
+ /*saturate_to_sign_bit=*/false);
case HloOpcode::kShiftRightLogical:
- return SaturateShiftIfNecessary(
- ir_builder_, lhs_value, rhs_value,
- ir_builder_->CreateLShr(lhs_value, rhs_value),
- /*saturate_to_sign_bit=*/false);
+ return SaturateShiftIfNecessary(b_, lhs_value, rhs_value,
+ b_->CreateLShr(lhs_value, rhs_value),
+ /*saturate_to_sign_bit=*/false);
default:
return Unimplemented("binary integer op '%s'",
HloOpcodeString(op->opcode()).c_str());
@@ -1200,21 +1168,19 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value,
llvm::Value* rhs_value,
bool is_signed) const {
- return ir_builder_->CreateSelect(
- ir_builder_->CreateICmp(
- is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
- lhs_value, rhs_value),
- lhs_value, rhs_value);
+ return b_->CreateSelect(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE
+ : llvm::ICmpInst::ICMP_UGE,
+ lhs_value, rhs_value),
+ lhs_value, rhs_value);
}
llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value,
llvm::Value* rhs_value,
bool is_signed) const {
- return ir_builder_->CreateSelect(
- ir_builder_->CreateICmp(
- is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
- lhs_value, rhs_value),
- lhs_value, rhs_value);
+ return b_->CreateSelect(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE
+ : llvm::ICmpInst::ICMP_ULE,
+ lhs_value, rhs_value),
+ lhs_value, rhs_value);
}
llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
@@ -1257,180 +1223,254 @@ llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
return source_index;
}
-llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator(
+StatusOr<llvm::Value*> ElementalIrEmitter::ConvertValueForDistribution(
const HloInstruction* hlo,
- const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator)
- const {
- PrimitiveType param_prim_type = hlo->operand(0)->shape().element_type();
- llvm::Type* param_ir_type =
- llvm_ir::PrimitiveTypeToIrType(param_prim_type, module_);
-
- // Same values as PCG library
- // https://github.com/imneme/pcg-c/blob/master/include/pcg_variants.h
- llvm::Value* multiplier = ir_builder_->getInt(
- llvm::APInt(128, {0x4385DF649FCCF645, 0x2360ED051FC65DA4}));
- llvm::Value* increment = ir_builder_->getInt(
- llvm::APInt(128, {0x14057B7EF767814F, 0x5851F42D4C957F2D}));
-
- auto random_value_from_hlo = [hlo]() {
- const HloModule* module =
- hlo->IsFused() ? hlo->parent()->FusionInstruction()->parent()->parent()
- : hlo->parent()->parent();
- return module->RandomNew64();
- };
+ const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+ const llvm_ir::IrArray::Index& index, llvm::Value* raw_value) const {
+ TF_ASSIGN_OR_RETURN(llvm::Value * a_or_mean,
+ operand_to_generator.at(hlo->operand(0))(index));
+ TF_ASSIGN_OR_RETURN(llvm::Value * b_or_sigma,
+ operand_to_generator.at(hlo->operand(1))(index));
+ PrimitiveType elem_prim_ty = hlo->shape().element_type();
+ llvm::Type* elem_ir_ty =
+ llvm_ir::PrimitiveTypeToIrType(elem_prim_ty, module_);
+ llvm::Type* raw_value_ty = raw_value->getType();
+
+ // Convert raw integer to float in range [0, 1) if the element is a float.
+ llvm::Value* elem_value = raw_value;
+ if (elem_ir_ty->isFloatingPointTy()) {
+ elem_value = b_->CreateUIToFP(elem_value, elem_ir_ty);
+ unsigned raw_value_size_in_bits = raw_value_ty->getPrimitiveSizeInBits();
+ CHECK(raw_value_size_in_bits == 32 || raw_value_size_in_bits == 64);
+ elem_value = b_->CreateFDiv(
+ elem_value,
+ llvm::ConstantFP::get(elem_ir_ty,
+ raw_value_size_in_bits == 64 ? 0x1p64 : 0x1p32));
+ }
+
+ // Convert the value for the requested distribution.
+ switch (hlo->random_distribution()) {
+ case RNG_UNIFORM: {
+ if (elem_ir_ty->isFloatingPointTy()) {
+ return b_->CreateFAdd(
+ b_->CreateFMul(b_->CreateFSub(b_or_sigma, a_or_mean), elem_value),
+ a_or_mean);
+ } else {
+ // To generate a uniform random value in [a, b) from a raw random sample
+ // in range [0, 2^N), we let range = b - a and return
+ // (a + raw_value % range). If range is not a power of 2, raw values
+ // larger than (2^N - 2^N % range) are biased toward results in
+        // [a, a + (2^N % range)). An unbiased algorithm would need to drop
+ // raw values and re-sample, but we don't do this because re-sampling in
+ // an efficient way is complex, and it's not clear that users need it.
+ // In particular, if one thread in a GPU warp needs to re-sample, we pay
+ // the same cost as if the whole warp were to re-sample. So an
+ // efficient re-sampling implementation on GPU would need to do
+ // nontrivial work to share entropy between threads in the warp.
+ auto range = b_->CreateSub(b_or_sigma, a_or_mean);
+ return b_->CreateAdd(a_or_mean, b_->CreateURem(elem_value, range));
+ }
+ }
+ case RNG_NORMAL: {
+ TF_ASSIGN_OR_RETURN(
+ llvm::Value * r,
+ EmitErfcInv(elem_prim_ty,
+ b_->CreateFMul(llvm::ConstantFP::get(elem_ir_ty, 2.0),
+ elem_value)));
+ return b_->CreateFAdd(b_->CreateFMul(r, b_or_sigma), a_or_mean);
+ }
+ default:
+ return InvalidArgument(
+ "unhandled distribution %s",
+ RandomDistribution_Name(hlo->random_distribution()).c_str());
+ }
+}
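The modulo bias described in the RNG_UNIFORM comment above, made concrete with tiny numbers (N = 3 raw bits, range = 3; illustration only):

    #include <cstdio>

    void BiasDemo() {
      int counts[3] = {0, 0, 0};
      for (int raw = 0; raw < 8; ++raw) counts[raw % 3]++;
      // Prints "3 3 2": 2^3 % 3 = 2, so the band [a, a+2) is over-represented.
      std::printf("%d %d %d\n", counts[0], counts[1], counts[2]);
    }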
+
+namespace {
+
+// Checks that the primitive type is supported by the elemental IR emitter for
+// Philox RNG and returns the number of elements in each 128 bit sample of the
+// Philox RNG algorithm.
+int32 GetNumberOfElementsPerPhiloxRngSample(PrimitiveType elem_prim_ty) {
+ // Calculate the number of elements, that is the number of random numbers, in
+ // a 128 bit sample.
+ switch (elem_prim_ty) {
+ case U32:
+ case S32:
+ case F32:
+ // The algorithm uses 32 bits to generate values for F16.
+ case F16:
+ return 4;
+ case U64:
+ case F64:
+ return 2;
+ default:
+ // BF16 is converted to F16 by the hlo pass HloElementTypeConverter.
+      // Other data types are not supported by the XLA random operation.
+ LOG(FATAL) << "Unrecognized primitive type for RNG " << elem_prim_ty;
+ }
+ return 0;
+}
+
+// Calculates the four uint32 values for the 128-bit Philox sample.
+std::array<llvm::Value*, 4> CalculateSampleValues(
+ llvm::Value* sample_idx, llvm::Value* hlo_random_value,
+ llvm::Value* global_random_number, llvm::Value* rng_state,
+ llvm::IRBuilder<>* b) {
+ llvm::Type* index_ty = sample_idx->getType();
+
+ std::array<llvm::Value*, 4> counter_values;
+
+ // Use the sample index to initialize counter[0] and counter[1].
+ unsigned index_ty_size_in_bits = index_ty->getPrimitiveSizeInBits();
+ CHECK(index_ty_size_in_bits == 32 || index_ty_size_in_bits == 64);
+ if (index_ty_size_in_bits == 32) {
+ counter_values[0] = sample_idx;
+ counter_values[1] = b->getInt32(0);
+ } else {
+ std::tie(counter_values[0], counter_values[1]) =
+ llvm_ir::SplitInt64ToInt32s(b, sample_idx);
+ }
+
+ // Xor the global state variable with the global random number seed and use
+ // the result to initialize counter[2] and counter[3].
+ std::tie(counter_values[2], counter_values[3]) = llvm_ir::SplitInt64ToInt32s(
+ b, b->CreateXor(rng_state, global_random_number));
- // Seed each RNG emitter with a new 64-bit seed from the HloModule. If the
- // compilation order is deterministic (i.e., RandomNew64 invocation order is
- // deterministic), then the order of RNG is deterministic for a given seed and
- // hence tests will be deterministic.
- // If the user provides a global seed instruction then we only use 64-bits of
- // the host's random number generator to seed the 128 bit value with the other
- // 64-bits is due to a user specified global seed instruction.
- // Create a GlobalVariable to maintain state between invocations. There is a
- // bug in NVPTX with GlobalVariable and 128 bit values, so using 2 64-bit
+ // The algorithm uses a 64 bit key, which is also interpreted as two uint32
// values.
- llvm::GlobalVariable* state_ptr0 = new llvm::GlobalVariable(
- /*M=*/*module_,
- /*Ty=*/ir_builder_->getInt64Ty(),
- /*isConstant=*/false,
- /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
- /*Initializer=*/ir_builder_->getInt64(random_value_from_hlo()),
- /*Name=*/"state_ptr0");
-
- // When the module config seed is 0, the expected result of a prng is a random
- // value. Instead of using the random_value_from_hlo, we need a global random
- // value as the graph seed. This is because if we use random_value_from_hlo
- // here, then for a newly built hlo graph, it always gives the same number.
- uint64 graph_seed = hlo_module_config_.seed() != 0 ? hlo_module_config_.seed()
- : GlobalRandomValue();
- llvm::GlobalVariable* state_ptr1 = new llvm::GlobalVariable(
- /*M=*/*module_,
- /*Ty=*/ir_builder_->getInt64Ty(),
- /*isConstant=*/false,
- /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
- /*Initializer=*/ir_builder_->getInt64(graph_seed),
- /*Name=*/"state_ptr1");
-
- // We want each thread to use its own stream, so we modify the increment per
- // thread. We want the increment to remain odd, so we shift the thread id left
- // 1 and add it to the increment.
- increment = ir_builder_->CreateAdd(increment,
- ir_builder_->CreateShl(EmitThreadId(), 1));
-
- // PCG-XSL-RR algorithm
- // http://www.pcg-random.org/pdf/toms-oneill-pcg-family-v1.02.pdf
- // state = multiplier * state + increment
- // return uint64_t(state ^ (state >> 64))) >>> (state >> 122)
- // where ">>>" is bitwise rotation
- auto get_next_i64 = [=]() {
- llvm::Value* state0 = ir_builder_->CreateZExtOrTrunc(
- ir_builder_->CreateLoad(state_ptr0, "state0"),
- ir_builder_->getInt128Ty());
- llvm::Value* state1 = ir_builder_->CreateShl(
- ir_builder_->CreateZExtOrTrunc(
- ir_builder_->CreateLoad(state_ptr1, "state1"),
- ir_builder_->getInt128Ty()),
- 64);
- llvm::Value* state = ir_builder_->CreateOr(state0, state1);
- llvm::Value* updated = ir_builder_->CreateAdd(
- ir_builder_->CreateMul(state, multiplier), increment);
- ir_builder_->CreateStore(
- ir_builder_->CreateTrunc(updated, ir_builder_->getInt64Ty()),
- state_ptr0);
- ir_builder_->CreateStore(
- ir_builder_->CreateTrunc(ir_builder_->CreateLShr(updated, 64),
- ir_builder_->getInt64Ty()),
- state_ptr1);
-
- return llvm_ir::CreateRor(
- ir_builder_->CreateTrunc(
- ir_builder_->CreateXor(state, ir_builder_->CreateLShr(state, 64)),
- ir_builder_->getInt64Ty()),
- ir_builder_->CreateTrunc(ir_builder_->CreateLShr(state, 122),
- ir_builder_->getInt64Ty()),
- ir_builder_);
- };
+ llvm::Value* key_values[2];
+
+ // Use a module random number to initialize the key.
+ std::tie(key_values[0], key_values[1]) =
+ llvm_ir::SplitInt64ToInt32s(b, hlo_random_value);
+
+ // Prepare the constants used in the Philox RNG Algorithm.
+ llvm::Value* philoxW32A = b->getInt32(0x9E3779B9);
+ llvm::Value* philoxW32B = b->getInt32(0xBB67AE85);
+ llvm::Value* philoxM4xW32A = b->getInt32(0xD2511F53);
+ llvm::Value* philoxM4xW32B = b->getInt32(0xCD9E8D57);
+
+ // Compute the 128 bit value for the current sample by repeating the
+ // single round computation and key raising computation for ten times.
+ for (int round = 0; round < 10; ++round) {
+ // A single round of computation of the counter values is as follows:
+ // MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
+ // MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
+ // counter[0] = hi1 ^ counter[1] ^ key[0];
+ // counter[1] = lo1;
+ // counter[2] = hi0 ^ counter[3] ^ key[1];
+ // counter[3] = lo0;
+ llvm::Value* lo0;
+ llvm::Value* hi0;
+ std::tie(lo0, hi0) =
+ llvm_ir::UMulLowHigh32(b, philoxM4xW32A, counter_values[0]);
+ llvm::Value* lo1;
+ llvm::Value* hi1;
+ std::tie(lo1, hi1) =
+ llvm_ir::UMulLowHigh32(b, philoxM4xW32B, counter_values[2]);
+ counter_values[0] =
+ b->CreateXor(hi1, b->CreateXor(counter_values[1], key_values[0]));
+ counter_values[1] = lo1;
+ counter_values[2] =
+ b->CreateXor(hi0, b->CreateXor(counter_values[3], key_values[1]));
+ counter_values[3] = lo0;
+ key_values[0] = b->CreateAdd(key_values[0], philoxW32A);
+ key_values[1] = b->CreateAdd(key_values[1], philoxW32B);
+ }
- auto get_next_uniform_float = [=]() {
- return ir_builder_->CreateFDiv(
- ir_builder_->CreateUIToFP(get_next_i64(), param_ir_type),
- llvm::ConstantFP::get(param_ir_type, 0x1p64));
- };
+ return counter_values;
+}
+} // namespace
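For readers following the round structure, here is CalculateSampleValues collapsed into a host-side scalar sketch with the same constants and data flow as the IR above (not part of the patch):

    #include <array>
    #include <cstdint>

    // One Philox-4x32 keyed counter transform, ten rounds, mirroring the
    // emitted loop (constants from Salmon et al., SC 2011).
    std::array<uint32_t, 4> Philox4x32(std::array<uint32_t, 4> counter,
                                       std::array<uint32_t, 2> key) {
      for (int round = 0; round < 10; ++round) {
        // MultiplyHighLow: full 64-bit products split into low/high halves.
        uint64_t p0 = uint64_t{0xD2511F53} * counter[0];
        uint64_t p1 = uint64_t{0xCD9E8D57} * counter[2];
        uint32_t lo0 = static_cast<uint32_t>(p0);
        uint32_t hi0 = static_cast<uint32_t>(p0 >> 32);
        uint32_t lo1 = static_cast<uint32_t>(p1);
        uint32_t hi1 = static_cast<uint32_t>(p1 >> 32);
        counter = {hi1 ^ counter[1] ^ key[0], lo1,
                   hi0 ^ counter[3] ^ key[1], lo0};
        key[0] += 0x9E3779B9;  // raise the key, as in the loop above
        key[1] += 0xBB67AE85;
      }
      return counter;
    }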
+
+// Implements the Philox algorithm to generate random numbers in parallel.
+// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
+// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
+//
+// The paper presents a few variants of the Philox algorithm; we picked the
+// 4x32_10 version of the algorithm for the following reasons:
+// . 4x32 uses 32-bit multiplication which is fast on GPUs.
+// . The authors recommend the 10-round variant, and TensorFlow also uses it.
+//
+// Precondition: the RNG instruction is not fused.
+llvm_ir::ElementGenerator ElementalIrEmitter::MakePhiloxRngElementGenerator(
+ const HloInstruction* hlo,
+ const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator)
+ const {
+ VLOG(3) << "Using philox RNG algorithm";
+ CHECK(!hlo->IsFused());
+ // A random number generated by the per module random number generator.
+ // This ensures that each RNG HLO generates a different random sequence.
+ llvm::Value* hlo_random_value = b_->getInt64(hlo->GetModule()->RandomNew64());
+ // A value specified by the configuration or generated by a global random
+ // number generator.
+ llvm::Value* global_random_number =
+ b_->getInt64(hlo_module_config_.seed() != 0 ? hlo_module_config_.seed()
+ : GlobalRandomValue());
+
+ int elems_per_sample =
+ GetNumberOfElementsPerPhiloxRngSample(hlo->shape().element_type());
+
+ // Allocate stack storage for the 128 bit sample as four int32.
+ llvm::Type* int32_ty = b_->getInt32Ty();
+ llvm::Value* sample_address = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+ int32_ty, /*element_count=*/b_->getInt32(4), "sample", b_);
+
+ // Load the global state variable for the Philox RNG algorithm.
+ llvm::GlobalVariable* rng_state_ptr =
+ llvm_ir::GetOrCreateVariableForPhiloxRngState(module_, b_);
+ llvm::Value* rng_state = b_->CreateLoad(rng_state_ptr, "rng_state_value");
+
+ // Build and return the elemental IR generator to generate a random value for
+ // the element corresponding to the current thread.
+ //
+ // This elemental IR generator computes one sample with multiple random
+ // numbers but only returns one random number. As a result, neighboring
+ // threads may calculate the same sample unnecessarily. However, if the
+ // kernel containing the RNG hlo is unrolled, LLVM is able to optimize away
+ // the duplicated computation of the same sample. In particular, if the unroll
+  // factor is a multiple of elems_per_sample, LLVM is able to completely
+ // remove such duplicated computation. If the unroll factor is a non-trivial
+ // factor of elems_per_sample, LLVM can only partially remove such duplicated
+ // computation.
return [=](const llvm_ir::IrArray::Index& index) -> StatusOr<llvm::Value*> {
- switch (hlo->random_distribution()) {
- case RNG_UNIFORM: {
- TF_ASSIGN_OR_RETURN(llvm::Value * p,
- operand_to_generator.at(hlo->operand(0))(index));
- TF_ASSIGN_OR_RETURN(llvm::Value * q,
- operand_to_generator.at(hlo->operand(1))(index));
- if (primitive_util::IsFloatingPointType(param_prim_type)) {
- return ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(ir_builder_->CreateFSub(q, p),
- get_next_uniform_float()),
- p);
- } else {
- auto r = ir_builder_->CreateSub(q, p);
- auto leading_zeros = llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::ctlz, {r, ir_builder_->getInt1(true)},
- {param_ir_type}, ir_builder_);
- auto in_block = ir_builder_->GetInsertBlock();
-
- // A terminator should be present iff we're emitting code
- // into the middle (as opposed to the end) of a basic block.
- CHECK_EQ(ir_builder_->GetInsertPoint() == in_block->end(),
- in_block->getTerminator() == nullptr);
-
- llvm::BasicBlock* body_block;
- llvm::BasicBlock* out_block;
-
- if (ir_builder_->GetInsertPoint() == in_block->end()) {
- body_block = llvm_ir::CreateBasicBlock(
- nullptr, IrName(hlo, "rng_body"), ir_builder_);
- out_block = llvm_ir::CreateBasicBlock(
- nullptr, IrName(hlo, "rng_out"), ir_builder_);
- llvm::BranchInst::Create(body_block, in_block);
- } else {
- body_block = in_block->splitBasicBlock(
- ir_builder_->GetInsertPoint(), "rng_body");
- out_block = body_block->splitBasicBlock(
- ir_builder_->GetInsertPoint(), "rng_out");
- body_block->getTerminator()->eraseFromParent();
- }
-
- SetToFirstInsertPoint(body_block, ir_builder_);
- auto random = ir_builder_->CreateAnd(
- ir_builder_->CreateZExtOrTrunc(get_next_i64(), param_ir_type),
- ir_builder_->CreateLShr(llvm::ConstantInt::get(param_ir_type, ~0),
- leading_zeros));
- llvm::BranchInst::Create(out_block, body_block,
- ir_builder_->CreateICmpULT(random, r),
- body_block);
- SetToFirstInsertPoint(out_block, ir_builder_);
- return ir_builder_->CreateAdd(
- p, ir_builder_->CreateSelect(
- ir_builder_->CreateICmpEQ(p, q),
- llvm::ConstantInt::get(param_ir_type, 0), random));
- }
- }
- case RNG_NORMAL: {
- TF_ASSIGN_OR_RETURN(llvm::Value * m,
- operand_to_generator.at(hlo->operand(0))(index));
- TF_ASSIGN_OR_RETURN(llvm::Value * s,
- operand_to_generator.at(hlo->operand(1))(index));
- TF_ASSIGN_OR_RETURN(
- llvm::Value * r,
- EmitErfcInv(param_prim_type,
- ir_builder_->CreateFMul(
- llvm::ConstantFP::get(param_ir_type, 2.0),
- get_next_uniform_float())));
- return ir_builder_->CreateFAdd(ir_builder_->CreateFMul(r, s), m);
- }
- default:
- return InvalidArgument(
- "unhandled distribution %s",
- RandomDistribution_Name(hlo->random_distribution()).c_str());
+ llvm::Type* index_ty = index.GetType();
+ // Calculate the linear element index.
+ llvm::Value* elem_idx = index.linear();
+ if (elem_idx == nullptr) {
+ elem_idx = index.Linearize(AsInt64Slice(hlo->shape().dimensions()), b_);
}
+
+ // Calculate the index for the 128 bit sample and the offset of the current
+ // element within the sample.
+ llvm::Value* elems_per_sample_value =
+ llvm::ConstantInt::get(index_ty, elems_per_sample);
+ llvm::Value* sample_idx = b_->CreateUDiv(elem_idx, elems_per_sample_value);
+ llvm::Value* elem_offset = b_->CreateURem(elem_idx, elems_per_sample_value);
+
+ std::array<llvm::Value*, 4> counter_values = CalculateSampleValues(
+ sample_idx, hlo_random_value, global_random_number, rng_state, b_);
+
+ // Store the four counter_values into the sample_address alloca so we can
+ // load the elem_offset'th one below.
+ for (int idx = 0; idx < 4; ++idx) {
+ b_->CreateStore(counter_values[idx],
+ b_->CreateInBoundsGEP(sample_address, b_->getInt32(idx)));
+ }
+
+ llvm::Type* int64_ty = b_->getInt64Ty();
+ CHECK(elems_per_sample == 2 || elems_per_sample == 4);
+ llvm::Type* raw_value_ty = elems_per_sample == 2 ? int64_ty : int32_ty;
+ // Retrieve the raw value for the current element from the current sample.
+ llvm::Value* raw_elem_value = b_->CreateLoad(
+ b_->CreateInBoundsGEP(
+ b_->CreatePointerCast(sample_address, raw_value_ty->getPointerTo()),
+ elem_offset),
+ "raw_elem_value");
+
+ return ConvertValueForDistribution(hlo, operand_to_generator, index,
+ raw_elem_value);
};
}
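The final load reinterprets the four stored uint32 counters through a pointer cast. A host-side equivalent for the elems_per_sample == 2 (64-bit element) case, with memcpy standing in for the CreatePointerCast + CreateInBoundsGEP + CreateLoad sequence (hypothetical helper):

    #include <cstdint>
    #include <cstring>

    uint64_t RawValueForOffset(const uint32_t sample[4], int elem_offset) {
      uint64_t raw;  // elem_offset is 0 or 1 when elems_per_sample == 2
      std::memcpy(&raw, sample + 2 * elem_offset, sizeof(raw));
      return raw;
    }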
@@ -1447,9 +1487,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalSelect(
TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value,
operand_to_generator.at(hlo->operand(2))(
ElementwiseSourceIndex(index, *hlo, 2)));
- return ir_builder_->CreateSelect(
- ir_builder_->CreateTrunc(pred_value, ir_builder_->getInt1Ty()),
- on_true_value, on_false_value);
+ return b_->CreateSelect(b_->CreateTrunc(pred_value, b_->getInt1Ty()),
+ on_true_value, on_false_value);
}
StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalClamp(
@@ -1485,64 +1524,62 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalConcatenate(
const int64 concat_dim = hlo->dimensions(0);
auto source_index = target_index;
- llvm::BasicBlock* init_block = ir_builder_->GetInsertBlock();
+ llvm::BasicBlock* init_block = b_->GetInsertBlock();
// A terminator should be present iff we're emitting code
// into the middle (as opposed to the end) of a basic block.
- CHECK_EQ(ir_builder_->GetInsertPoint() == init_block->end(),
+ CHECK_EQ(b_->GetInsertPoint() == init_block->end(),
init_block->getTerminator() == nullptr);
llvm::BasicBlock* exit_block;
- if (ir_builder_->GetInsertPoint() == init_block->end()) {
+ if (b_->GetInsertPoint() == init_block->end()) {
exit_block = llvm_ir::CreateBasicBlock(
- /*insert_before=*/nullptr, IrName(hlo, "merge"), ir_builder_);
+ /*insert_before=*/nullptr, IrName(hlo, "merge"), b_);
} else {
- exit_block = init_block->splitBasicBlock(ir_builder_->GetInsertPoint(),
+ exit_block = init_block->splitBasicBlock(b_->GetInsertPoint(),
AsStringRef(IrName(hlo, "merge")));
init_block->getTerminator()->eraseFromParent();
}
- llvm_ir::SetToFirstInsertPoint(exit_block, ir_builder_);
- llvm::PHINode* output = ir_builder_->CreatePHI(
+ llvm_ir::SetToFirstInsertPoint(exit_block, b_);
+ llvm::PHINode* output = b_->CreatePHI(
llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_),
hlo->operands().size());
- auto prior_insert_point = ir_builder_->GetInsertPoint();
+ auto prior_insert_point = b_->GetInsertPoint();
- ir_builder_->SetInsertPoint(init_block);
+ b_->SetInsertPoint(init_block);
for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
++operand_idx) {
const HloInstruction* operand = hlo->operand(operand_idx);
auto true_block = llvm_ir::CreateBasicBlock(
- exit_block, StrCat("concat_index_from_operand", operand_idx),
- ir_builder_);
+ exit_block, StrCat("concat_index_from_operand", operand_idx), b_);
auto false_block = llvm_ir::CreateBasicBlock(
- exit_block, StrCat("concat_index_not_from_operand", operand_idx),
- ir_builder_);
+ exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_);
auto concat_dim_size =
llvm::ConstantInt::get(source_index[concat_dim]->getType(),
operand->shape().dimensions(concat_dim));
- ir_builder_->CreateCondBr(
- ir_builder_->CreateICmpULT(source_index[concat_dim], concat_dim_size),
+ b_->CreateCondBr(
+ b_->CreateICmpULT(source_index[concat_dim], concat_dim_size),
true_block, false_block);
// Create the terminator of the true block before calling operand
// generators, because they require non-degenerate basic blocks.
- ir_builder_->SetInsertPoint(
+ b_->SetInsertPoint(
llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block));
TF_ASSIGN_OR_RETURN(llvm::Value * value,
operand_to_generator.at(operand)(source_index));
- output->addIncoming(value, ir_builder_->GetInsertBlock());
+ output->addIncoming(value, b_->GetInsertBlock());
// Subtract the size of the concat dimension of the current operand
// from the source index.
- ir_builder_->SetInsertPoint(false_block);
+ b_->SetInsertPoint(false_block);
source_index[concat_dim] =
- ir_builder_->CreateSub(source_index[concat_dim], concat_dim_size);
+ b_->CreateSub(source_index[concat_dim], concat_dim_size);
}
- ir_builder_->CreateUnreachable();
- ir_builder_->SetInsertPoint(exit_block, prior_insert_point);
+ b_->CreateUnreachable();
+ b_->SetInsertPoint(exit_block, prior_insert_point);
return output;
}
@@ -1566,12 +1603,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
// Clamp the start index so that the sliced portion fits in the operand:
// start_index = clamp(start_index, 0, operand_dim_size - output_dim_size)
-
- // TODO(b/74360564): This is implementation defined behavior, but is
- // currently respected by all implementations. Change this if we ever decide
- // to officially document different behavior.
- start_index_value =
- ir_builder_->CreateSExtOrTrunc(start_index_value, index_type);
+ start_index_value = b_->CreateSExtOrTrunc(start_index_value, index_type);
int64 largest_valid_start_index =
input_hlo->shape().dimensions(i) - hlo->shape().dimensions(i);
CHECK_GE(largest_valid_start_index, 0);
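The clamp described in the comment above, as a scalar sketch (hypothetical helper): it pins start_index so that the whole output window fits inside the operand.

    #include <algorithm>
    #include <cstdint>

    int64_t ClampSliceStart(int64_t start, int64_t operand_dim_size,
                            int64_t output_dim_size) {
      // start_index = clamp(start_index, 0,
      //                     operand_dim_size - output_dim_size)
      return std::min(std::max(start, int64_t{0}),
                      operand_dim_size - output_dim_size);
    }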
@@ -1591,7 +1623,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
for (int64 i = 0; i < rank; ++i) {
// Emit IR which computes:
// input_index = start_index + offset_index
- input_index[i] = ir_builder_->CreateAdd(slice_start_index[i], index[i]);
+ input_index[i] = b_->CreateAdd(slice_start_index[i], index[i]);
}
return operand_to_generator.at(input_hlo)(input_index);
}
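The clamp in the hunk above has a simple scalar reading: cast the start index to the index type, then clamp it so the slice fits inside the operand. A sketch, assuming nothing beyond what the surrounding lines state:

    #include <algorithm>
    #include <cstdint>

    int64_t ClampSliceStart(int64_t start, int64_t operand_dim,
                            int64_t output_dim) {
      const int64_t largest_valid = operand_dim - output_dim;  // >= 0, CHECK'd
      return std::min(std::max(start, int64_t{0}), largest_valid);
    }
    // Then, per dimension: input_index[i] = clamped_start + offset_index[i],
    // as in the CreateAdd of the following hunk.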
@@ -1649,7 +1681,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
auto add_to_operand_index = [&](llvm::Value* index_component, int64 dim) {
llvm::Value* gather_dim_component_extended =
- ir_builder_->CreateSExtOrTrunc(index_component, index_type);
+ b_->CreateSExtOrTrunc(index_component, index_type);
int64 operand_dim = dim_numbers.gather_dims_to_operand_dims(dim);
int64 output_dim = operand_to_output_dim[operand_dim];
// If 'output_dim' is -1, it means 'operand_dim' is an elided window dim.
@@ -1673,7 +1705,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
gather_dim_component_extended, is_signed),
is_signed);
- operand_index[operand_dim] = ir_builder_->CreateAdd(
+ operand_index[operand_dim] = b_->CreateAdd(
operand_index[operand_dim], gather_dim_component_extended_inbound);
};
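A hedged scalar model of add_to_operand_index: each gather index component is widened to the index type and, in lines elided from this hunk, clamped so the gathered window stays in bounds before being accumulated into the operand index. The clamp bounds below are an assumption about that elided code:

    #include <algorithm>
    #include <cstdint>

    int64_t AccumulateGatherIndex(int64_t operand_index, int64_t component,
                                  int64_t operand_bound, int64_t window_bound) {
      // Assumed semantics of the elided min/max: keep start + window in range.
      const int64_t clamped = std::min(std::max(component, int64_t{0}),
                                       operand_bound - window_bound);
      return operand_index + clamped;
    }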
@@ -1708,7 +1740,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
llvm_ir::IrArray::Index slice_limit_index(index.GetType(), rank);
// Slice intersection gathers (ANDs) conditions on all ranks for which
// 'input' is set to 'update'
- llvm::Value* slice_intersection = ir_builder_->getTrue();
+ llvm::Value* slice_intersection = b_->getTrue();
for (int64 i = 0; i < rank; ++i) {
llvm::Type* index_type = index[0]->getType();
@@ -1721,12 +1753,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
// Clamp the start index so that the update region fits in the operand.
// start_index = clamp(start_index, 0, input_dim_size - update_dim_size)
-
- // TODO(b/74360564): This is implementation defined behavior, but is
- // currently respected by all implementations. Change this if we ever decide
- // to officially document different behavior.
- start_index_value =
- ir_builder_->CreateSExtOrTrunc(start_index_value, index_type);
+ start_index_value = b_->CreateSExtOrTrunc(start_index_value, index_type);
llvm::Value* update_dim_size =
index_typed_const(update_hlo->shape().dimensions(i));
int64 largest_valid_start_index =
@@ -1742,16 +1769,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
start_index_value->setName(
AsStringRef(IrName(hlo, StrCat("start_idx", i))));
slice_start_index[i] = start_index_value;
- slice_limit_index[i] =
- ir_builder_->CreateAdd(slice_start_index[i], update_dim_size);
+ slice_limit_index[i] = b_->CreateAdd(slice_start_index[i], update_dim_size);
- slice_intersection = ir_builder_->CreateAnd(
- slice_intersection,
- ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]),
+ slice_intersection = b_->CreateAnd(
+ slice_intersection, b_->CreateICmpSGE(index[i], slice_start_index[i]),
"slice_intersection");
- slice_intersection = ir_builder_->CreateAnd(
- slice_intersection,
- ir_builder_->CreateICmpSLT(index[i], slice_limit_index[i]),
+ slice_intersection = b_->CreateAnd(
+ slice_intersection, b_->CreateICmpSLT(index[i], slice_limit_index[i]),
"slice_intersection");
}
@@ -1760,29 +1784,29 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
// else -> return data from 'input'.
llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_),
- "ret_value_addr", ir_builder_);
- llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
- slice_intersection, "slice_intersection", ir_builder_);
+ "ret_value_addr", b_);
+ llvm_ir::LlvmIfData if_data =
+ llvm_ir::EmitIfThenElse(slice_intersection, "slice_intersection", b_);
// Handle true BB (return data from 'update')
- SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+ SetToFirstInsertPoint(if_data.true_block, b_);
// Compute update index for intersection case.
llvm_ir::IrArray::Index update_index(index.GetType(), rank);
for (int64 i = 0; i < rank; ++i) {
- update_index[i] = ir_builder_->CreateSub(index[i], slice_start_index[i]);
+ update_index[i] = b_->CreateSub(index[i], slice_start_index[i]);
}
TF_ASSIGN_OR_RETURN(llvm::Value * true_value,
operand_to_generator.at(update_hlo)(update_index));
- ir_builder_->CreateStore(true_value, ret_value_addr);
+ b_->CreateStore(true_value, ret_value_addr);
// Handle false BB (return data from 'input')
- SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+ SetToFirstInsertPoint(if_data.false_block, b_);
TF_ASSIGN_OR_RETURN(llvm::Value * false_value,
operand_to_generator.at(input_hlo)(index));
- ir_builder_->CreateStore(false_value, ret_value_addr);
+ b_->CreateStore(false_value, ret_value_addr);
- SetToFirstInsertPoint(if_data.after_block, ir_builder_);
- return ir_builder_->CreateLoad(ret_value_addr);
+ SetToFirstInsertPoint(if_data.after_block, b_);
+ return b_->CreateLoad(ret_value_addr);
}
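The slice_intersection predicate built above ANDs one interval test per dimension; a scalar restatement (illustration only):

    #include <cstdint>
    #include <vector>

    bool InUpdateWindow(const std::vector<int64_t>& index,
                        const std::vector<int64_t>& start,
                        const std::vector<int64_t>& update_dims) {
      for (size_t i = 0; i < index.size(); ++i) {
        if (index[i] < start[i] || index[i] >= start[i] + update_dims[i]) {
          return false;
        }
      }
      return true;  // true -> read from 'update', false -> read from 'input'
    }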
StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalPad(
@@ -1790,29 +1814,29 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalPad(
const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
const llvm_ir::IrArray::Index& padded_index) const {
auto index = padded_index;
- llvm::Value* in_bounds = ir_builder_->getTrue();
+ llvm::Value* in_bounds = b_->getTrue();
for (size_t i = 0; i < index.size(); ++i) {
auto index_typed_const = [=](int64 n) {
return llvm::ConstantInt::get(index[i]->getType(), n);
};
const auto& pad_dim = hlo->padding_config().dimensions(i);
- index[i] = ir_builder_->CreateSub(
- index[i], index_typed_const(pad_dim.edge_padding_low()));
- in_bounds = ir_builder_->CreateAnd(
- in_bounds, ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)),
- "in_bounds");
- in_bounds = ir_builder_->CreateAnd(
+ index[i] =
+ b_->CreateSub(index[i], index_typed_const(pad_dim.edge_padding_low()));
+ in_bounds = b_->CreateAnd(in_bounds,
+ b_->CreateICmpSGE(index[i], index_typed_const(0)),
+ "in_bounds");
+ in_bounds = b_->CreateAnd(
in_bounds,
- ir_builder_->CreateICmpEQ(
+ b_->CreateICmpEQ(
index_typed_const(0),
- ir_builder_->CreateURem(
- index[i], index_typed_const(pad_dim.interior_padding() + 1))),
+ b_->CreateURem(index[i],
+ index_typed_const(pad_dim.interior_padding() + 1))),
"in_bounds");
- index[i] = ir_builder_->CreateSDiv(
+ index[i] = b_->CreateSDiv(
index[i], index_typed_const(pad_dim.interior_padding() + 1));
- in_bounds = ir_builder_->CreateAnd(
+ in_bounds = b_->CreateAnd(
in_bounds,
- ir_builder_->CreateICmpSLT(
+ b_->CreateICmpSLT(
index[i],
index_typed_const(hlo->operand(0)->shape().dimensions(i))),
"in_bounds");
@@ -1825,26 +1849,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalPad(
// }
llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_),
- "pad_result_addr", ir_builder_);
+ "pad_result_addr", b_);
llvm_ir::LlvmIfData if_data =
- llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
- SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+ llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_);
+ SetToFirstInsertPoint(if_data.true_block, b_);
TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
operand_to_generator.at(hlo->operand(0))(index));
- ir_builder_->CreateStore(operand_value, ret_value_addr);
+ b_->CreateStore(operand_value, ret_value_addr);
- SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+ SetToFirstInsertPoint(if_data.false_block, b_);
TF_ASSIGN_OR_RETURN(llvm::Value * padding_value,
operand_to_generator.at(hlo->operand(1))(
IrArray::Index(index.GetType())));
- ir_builder_->CreateStore(padding_value, ret_value_addr);
+ b_->CreateStore(padding_value, ret_value_addr);
- SetToFirstInsertPoint(if_data.after_block, ir_builder_);
+ SetToFirstInsertPoint(if_data.after_block, b_);
// Don't create phi(operand_value, padding_value) here, because invoking
// operand_to_generator may create new basic blocks, making the parent
// of operand_value or padding_value no longer a predecessor of
// if_data.after_block.
- return ir_builder_->CreateLoad(ret_value_addr);
+ return b_->CreateLoad(ret_value_addr);
}
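The in_bounds chain above, read per dimension with lo = edge_padding_low and ip = interior_padding, reduces to the following scalar test (a sketch; the emitter folds every dimension into a single predicate):

    #include <cstdint>

    bool PadMapsToOperand(int64_t padded_idx, int64_t lo, int64_t ip,
                          int64_t operand_dim, int64_t* operand_idx) {
      const int64_t shifted = padded_idx - lo;
      if (shifted < 0) return false;              // inside low edge padding
      if (shifted % (ip + 1) != 0) return false;  // inside interior padding
      *operand_idx = shifted / (ip + 1);
      return *operand_idx < operand_dim;          // else high edge padding
    }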
StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
@@ -1868,21 +1892,20 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
return llvm::ConstantInt::get(index_type, c);
};
- std::unique_ptr<llvm_ir::ForLoop> inner_loop =
- llvm_ir::ForLoop::EmitForLoop(IrName(hlo, "inner"), index_typed_const(0),
- index_typed_const(contracted_dim_size),
- index_typed_const(1), ir_builder_);
+ std::unique_ptr<llvm_ir::ForLoop> inner_loop = llvm_ir::ForLoop::EmitForLoop(
+ IrName(hlo, "inner"), index_typed_const(0),
+ index_typed_const(contracted_dim_size), index_typed_const(1), b_);
- SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(), ir_builder_);
+ SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(), b_);
PrimitiveType primitive_type = hlo->shape().element_type();
llvm::Type* primitive_type_llvm =
llvm_ir::PrimitiveTypeToIrType(primitive_type, module_);
- llvm::Value* accumulator_alloca = llvm_ir::EmitAllocaAtFunctionEntry(
- primitive_type_llvm, "dot_acc", ir_builder_);
- ir_builder_->CreateStore(llvm::Constant::getNullValue(primitive_type_llvm),
- accumulator_alloca);
+ llvm::Value* accumulator_alloca =
+ llvm_ir::EmitAllocaAtFunctionEntry(primitive_type_llvm, "dot_acc", b_);
+ b_->CreateStore(llvm::Constant::getNullValue(primitive_type_llvm),
+ accumulator_alloca);
- SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), ir_builder_);
+ SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), b_);
// This is the inner reduction loop for a dot operation that produces
// one element in the output. If the operands to the dot operation have
@@ -1902,43 +1925,36 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
}
rhs_index.InsertAt(rhs_contracting_dim, inner_loop->GetIndVarValue());
- llvm::Value* current_accumulator =
- ir_builder_->CreateLoad(accumulator_alloca);
+ llvm::Value* current_accumulator = b_->CreateLoad(accumulator_alloca);
TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value, lhs_generator(lhs_index));
TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value, rhs_generator(rhs_index));
llvm::Value* next_accumulator;
if (primitive_util::IsComplexType(primitive_type)) {
- llvm::Value* product_real = ir_builder_->CreateFSub(
- ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
- EmitExtractReal(rhs_value)),
- ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
- EmitExtractImag(rhs_value)));
- llvm::Value* product_imag = ir_builder_->CreateFAdd(
- ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
- EmitExtractImag(rhs_value)),
- ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
- EmitExtractReal(rhs_value)));
- next_accumulator = ir_builder_->CreateInsertValue(
+ llvm::Value* product_real = b_->CreateFSub(
+ b_->CreateFMul(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)),
+ b_->CreateFMul(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value)));
+ llvm::Value* product_imag = b_->CreateFAdd(
+ b_->CreateFMul(EmitExtractReal(lhs_value), EmitExtractImag(rhs_value)),
+ b_->CreateFMul(EmitExtractImag(lhs_value), EmitExtractReal(rhs_value)));
+ next_accumulator = b_->CreateInsertValue(
current_accumulator,
- ir_builder_->CreateFAdd(EmitExtractReal(current_accumulator),
- product_real),
+ b_->CreateFAdd(EmitExtractReal(current_accumulator), product_real),
{0});
- next_accumulator = ir_builder_->CreateInsertValue(
+ next_accumulator = b_->CreateInsertValue(
next_accumulator,
- ir_builder_->CreateFAdd(EmitExtractImag(current_accumulator),
- product_imag),
+ b_->CreateFAdd(EmitExtractImag(current_accumulator), product_imag),
{1});
} else if (primitive_util::IsFloatingPointType(primitive_type)) {
- next_accumulator = ir_builder_->CreateFAdd(
- current_accumulator, ir_builder_->CreateFMul(lhs_value, rhs_value));
+ next_accumulator = b_->CreateFAdd(current_accumulator,
+ b_->CreateFMul(lhs_value, rhs_value));
} else {
- next_accumulator = ir_builder_->CreateAdd(
- current_accumulator, ir_builder_->CreateMul(lhs_value, rhs_value));
+ next_accumulator =
+ b_->CreateAdd(current_accumulator, b_->CreateMul(lhs_value, rhs_value));
}
- ir_builder_->CreateStore(next_accumulator, accumulator_alloca);
+ b_->CreateStore(next_accumulator, accumulator_alloca);
- SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), ir_builder_);
- return ir_builder_->CreateLoad(accumulator_alloca);
+ SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), b_);
+ return b_->CreateLoad(accumulator_alloca);
}
llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
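The complex branch of the inner reduction loop spells out (ac - bd) and (ad + bc) on {real, imag} pairs; std::complex performs the identical multiply-accumulate, which makes the intent easy to check (illustration only):

    #include <complex>
    #include <cstddef>
    #include <vector>

    std::complex<float> DotAccumulate(
        const std::vector<std::complex<float>>& lhs,
        const std::vector<std::complex<float>>& rhs) {
      std::complex<float> acc(0.0f, 0.0f);
      for (std::size_t k = 0; k < lhs.size(); ++k) {
        acc += lhs[k] * rhs[k];  // matches product_real/product_imag above
      }
      return acc;
    }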
@@ -2038,7 +2054,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
const HloInstruction* operand = hlo->operand(0);
auto source_index = target_index;
for (int64 dim : hlo->dimensions()) {
- source_index[dim] = ir_builder_->CreateSub(
+ source_index[dim] = b_->CreateSub(
llvm::ConstantInt::get(target_index[dim]->getType(),
hlo->shape().dimensions(dim) - 1),
target_index[dim]);
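The kReverse mapping above is a single affine flip per reversed dimension; as a scalar one-liner:

    #include <cstdint>

    int64_t ReverseIndex(int64_t dim_size, int64_t target) {
      return (dim_size - 1) - target;  // mirror the index within the dimension
    }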
@@ -2051,16 +2067,16 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
const HloInstruction* operand = hlo->operand(0);
// The `dimensions` member of the broadcast instruction maps from
// input dimensions to output dimensions.
- return operand_to_generator.at(
- operand)(target_index.SourceIndexOfBroadcast(
- hlo->shape(), operand->shape(), hlo->dimensions(), ir_builder_));
+ return operand_to_generator.at(operand)(
+ target_index.SourceIndexOfBroadcast(hlo->shape(), operand->shape(),
+ hlo->dimensions(), b_));
};
case HloOpcode::kSlice:
return [this, hlo, &operand_to_generator](
const IrArray::Index& index) -> StatusOr<llvm::Value*> {
IrArray::Index sliced_index = index.SourceIndexOfSlice(
/*shape=*/hlo->shape(), /*starts=*/hlo->slice_starts(),
- /*strides=*/hlo->slice_strides(), /*builder=*/ir_builder_);
+ /*strides=*/hlo->slice_strides(), /*builder=*/b_);
return operand_to_generator.at(hlo->operand(0))(sliced_index);
};
case HloOpcode::kDynamicSlice:
@@ -2085,27 +2101,26 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
ShapeUtil::ElementsIn(hlo->operand(0)->shape()));
return [this, hlo, &operand_to_generator](const IrArray::Index& index) {
const HloInstruction* operand = hlo->operand(0);
- return operand_to_generator.at(operand)(index.SourceIndexOfBitcast(
- hlo->shape(), operand->shape(), ir_builder_));
+ return operand_to_generator.at(operand)(
+ index.SourceIndexOfBitcast(hlo->shape(), operand->shape(), b_));
};
case HloOpcode::kReshape:
CHECK_EQ(ShapeUtil::ElementsIn(hlo->shape()),
ShapeUtil::ElementsIn(hlo->operand(0)->shape()));
return [this, hlo, &operand_to_generator](const IrArray::Index& index) {
const HloInstruction* operand = hlo->operand(0);
- return operand_to_generator.at(operand)(index.SourceIndexOfReshape(
- hlo->shape(), operand->shape(), ir_builder_));
+ return operand_to_generator.at(operand)(
+ index.SourceIndexOfReshape(hlo->shape(), operand->shape(), b_));
};
case HloOpcode::kTranspose:
return [this, hlo,
&operand_to_generator](const IrArray::Index& target_index) {
return operand_to_generator.at(hlo->operand(0))(
target_index.SourceIndexOfTranspose(
- hlo->shape(), hlo->operand(0)->shape(), hlo->dimensions(),
- ir_builder_));
+ hlo->shape(), hlo->operand(0)->shape(), hlo->dimensions(), b_));
};
case HloOpcode::kRng:
- return MakeRngElementGenerator(hlo, operand_to_generator);
+ return MakePhiloxRngElementGenerator(hlo, operand_to_generator);
case HloOpcode::kPad:
return [this, hlo, &operand_to_generator](
const IrArray::Index& padded_index) -> StatusOr<llvm::Value*> {
@@ -2127,11 +2142,11 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
}
llvm::Value* ElementalIrEmitter::EmitExtractReal(llvm::Value* value) const {
- return ir_builder_->CreateExtractValue(value, {0});
+ return b_->CreateExtractValue(value, {0});
}
llvm::Value* ElementalIrEmitter::EmitExtractImag(llvm::Value* value) const {
- return ir_builder_->CreateExtractValue(value, {1});
+ return b_->CreateExtractValue(value, {1});
}
llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op,
@@ -2139,10 +2154,10 @@ llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op,
llvm::Value* imag) const {
auto cplx_type =
llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_);
- auto complex = ir_builder_->CreateInsertValue(
+ auto complex = b_->CreateInsertValue(
llvm::ConstantAggregateZero::get(cplx_type), real, {0});
if (imag != nullptr) {
- complex = ir_builder_->CreateInsertValue(complex, imag, {1});
+ complex = b_->CreateInsertValue(complex, imag, {1});
}
return complex;
}
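EmitExtractReal/EmitExtractImag and EmitComposeComplex treat a complex value as a two-element LLVM aggregate; the C analogue of that layout (for orientation, not part of the change):

    struct Complex64 {
      float real;  // extractvalue {0} / insertvalue at {0}
      float imag;  // extractvalue {1} / insertvalue at {1}
    };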
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index d199473374..fcb34557a5 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -34,10 +34,8 @@ class ElementalIrEmitter {
std::unordered_map<const HloInstruction*, llvm_ir::ElementGenerator>;
ElementalIrEmitter(const HloModuleConfig& hlo_module_config,
- llvm::Module* module, llvm::IRBuilder<>* ir_builder)
- : ir_builder_(ir_builder),
- module_(module),
- hlo_module_config_(hlo_module_config) {}
+ llvm::Module* module, llvm::IRBuilder<>* b)
+ : b_(b), module_(module), hlo_module_config_(hlo_module_config) {}
virtual ~ElementalIrEmitter() = default;
@@ -54,7 +52,7 @@ class ElementalIrEmitter {
const HloInstruction* hlo,
const HloToElementGeneratorMap& operand_to_generator) const;
- llvm::IRBuilder<>* ir_builder() const { return ir_builder_; }
+ llvm::IRBuilder<>* b() const { return b_; }
llvm::Module* module() const { return module_; }
protected:
@@ -144,9 +142,7 @@ class ElementalIrEmitter {
int64 operand_no) const;
// Identifier of the thread unique among all threads on the device
- virtual llvm::Value* EmitThreadId() const {
- return ir_builder_->getIntN(128, 0);
- }
+ virtual llvm::Value* EmitThreadId() const { return b_->getIntN(128, 0); }
StatusOr<llvm::Value*> EmitElementalSelect(
const HloInstruction* hlo,
@@ -188,7 +184,7 @@ class ElementalIrEmitter {
const HloToElementGeneratorMap& operand_to_generator,
const llvm_ir::IrArray::Index& dot_result_index) const;
- llvm::IRBuilder<>* const ir_builder_;
+ llvm::IRBuilder<>* const b_;
llvm::Module* module_;
@@ -197,10 +193,17 @@ class ElementalIrEmitter {
const HloModuleConfig& hlo_module_config_;
private:
- // Returns a ElementGenerator for a RNG HloInstruction.
- llvm_ir::ElementGenerator MakeRngElementGenerator(
+  // Returns an ElementGenerator for an RNG HloInstruction using the Philox

+ // random number generation algorithm.
+ llvm_ir::ElementGenerator MakePhiloxRngElementGenerator(
const HloInstruction* hlo,
const HloToElementGeneratorMap& operand_to_generator) const;
+ // Converts the raw value generated by a random number generation algorithm
+ // to the distribution requested by the RNG HloInstruction.
+ StatusOr<llvm::Value*> ConvertValueForDistribution(
+ const HloInstruction* hlo,
+ const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+ const llvm_ir::IrArray::Index& index, llvm::Value* raw_value) const;
};
} // namespace xla
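ConvertValueForDistribution separates bit generation (Philox) from distribution shaping. As an illustration of the kind of conversion involved for a uniform distribution, and not the code in the .cc, a standard bits-to-float trick:

    #include <cstdint>

    float BitsToUniform(uint32_t raw) {
      // Keep the top 24 bits so every result is exactly representable in f32;
      // the scale is 2^-24.
      return static_cast<float>(raw >> 8) * (1.0f / 16777216.0f);
    }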
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 7cf2746947..fd75847d0c 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -82,7 +82,18 @@ StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStreamWrapper(
StatusOr<ScopedShapedBuffer> return_value =
ExecuteOnStream(run_options, arguments, profile_ptr.get());
- TF_RETURN_IF_ERROR(return_value.status());
+ if (!return_value.status().ok()) {
+ if (profile != nullptr) {
+      // Ensure the ThenStartTimer call has completed before we destroy the timer.
+ // We already have a failure status to return, so just log this if it
+ // fails.
+ Status status = stream->BlockHostUntilDone();
+ if (!status.ok()) {
+ LOG(ERROR) << "Failed to BlockHostUntilDone: " << status;
+ }
+ }
+ return return_value.status();
+ }
if (profile != nullptr) {
VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc
index 6794cfe297..228c3fac95 100644
--- a/tensorflow/compiler/xla/service/execution_tracker.cc
+++ b/tensorflow/compiler/xla/service/execution_tracker.cc
@@ -25,7 +25,7 @@ limitations under the License.
namespace xla {
AsyncExecution::AsyncExecution(Backend* backend,
- std::vector<Backend::StreamPtr> streams,
+ std::vector<StreamPool::Ptr> streams,
const ExecutionProfile& profile,
GlobalDataHandle result)
: backend_(CHECK_NOTNULL(backend)),
@@ -46,9 +46,10 @@ Status AsyncExecution::BlockUntilDone() const {
ExecutionTracker::ExecutionTracker() : next_handle_(1) {}
-ExecutionHandle ExecutionTracker::Register(
- Backend* backend, std::vector<Backend::StreamPtr> streams,
- const ExecutionProfile& profile, GlobalDataHandle result) {
+ExecutionHandle ExecutionTracker::Register(Backend* backend,
+ std::vector<StreamPool::Ptr> streams,
+ const ExecutionProfile& profile,
+ GlobalDataHandle result) {
tensorflow::mutex_lock lock(execution_mutex_);
int64 handle = next_handle_++;
auto inserted = handle_to_execution_.emplace(
diff --git a/tensorflow/compiler/xla/service/execution_tracker.h b/tensorflow/compiler/xla/service/execution_tracker.h
index 4458152dd9..4e9b9f883e 100644
--- a/tensorflow/compiler/xla/service/execution_tracker.h
+++ b/tensorflow/compiler/xla/service/execution_tracker.h
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/executable_run_options.h"
#include "tensorflow/compiler/xla/service/backend.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -40,7 +40,7 @@ namespace xla {
// the stream when destructed.
class AsyncExecution {
public:
- AsyncExecution(Backend* backend, std::vector<Backend::StreamPtr> streams,
+ AsyncExecution(Backend* backend, std::vector<StreamPool::Ptr> streams,
const ExecutionProfile& profile, GlobalDataHandle result);
Status BlockUntilDone() const;
@@ -54,7 +54,7 @@ class AsyncExecution {
Backend* backend_;
// Stream on which the execution is launched.
- std::vector<Backend::StreamPtr> streams_;
+ std::vector<StreamPool::Ptr> streams_;
// Profile object of the execution to be returned to the user.
ExecutionProfile profile_;
@@ -72,7 +72,7 @@ class ExecutionTracker {
// Registers an execution with its backend, streams, and data handle to the
// execution result. Returns a handle for the registered execution.
ExecutionHandle Register(Backend* backend,
- std::vector<Backend::StreamPtr> stream,
+ std::vector<StreamPool::Ptr> stream,
const ExecutionProfile& profile,
GlobalDataHandle data);
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index ca39797e81..a73a341fdb 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -36,6 +36,7 @@ cc_library(
hdrs = ["gpu_constants.h"],
deps = [
"//tensorflow/compiler/xla:types",
+ "//tensorflow/core:framework",
],
)
@@ -113,11 +114,13 @@ cc_library(
srcs = ["hlo_to_ir_bindings.cc"],
hdrs = ["hlo_to_ir_bindings.h"],
deps = [
+ ":buffer_allocations",
":ir_emission_utils",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:buffer_assignment",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service/llvm_ir:alias_analysis",
+ "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
"//tensorflow/compiler/xla/service/llvm_ir:ir_array",
"//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
"//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
@@ -141,6 +144,7 @@ cc_library(
],
deps = [
":backend_configs",
+ ":buffer_allocations",
":cudnn_convolution_runner",
":elemental_ir_emitter",
":gpu_constants",
@@ -162,6 +166,7 @@ cc_library(
"//tensorflow/compiler/xla/service:elemental_ir_emitter",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:name_uniquer",
+ "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
"//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util",
"//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter",
"//tensorflow/compiler/xla/service/llvm_ir:ir_array",
@@ -216,6 +221,7 @@ cc_library(
"//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
"//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
"//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
+ "//tensorflow/compiler/xla/service/llvm_ir:math_ops",
"//tensorflow/core:lib",
"@llvm//:core",
"@llvm//:support",
@@ -246,7 +252,7 @@ cc_library(
deps = [
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_execution_profile",
- "//tensorflow/compiler/xla/service:pool",
+ "//tensorflow/compiler/xla/service:stream_pool",
"//tensorflow/core:lib",
"//tensorflow/core:ptr_util",
"//tensorflow/core:stream_executor_no_cuda",
@@ -269,6 +275,7 @@ cc_library(
"memset_thunk.cc",
"outfeed_thunk.cc",
"sequential_thunk.cc",
+ "thunk.cc",
"thunk_schedule.cc",
"tuple_thunk.cc",
"while_thunk.cc",
@@ -320,6 +327,7 @@ cc_library(
"//tensorflow/compiler/xla/service:shaped_buffer",
"//tensorflow/compiler/xla/service:transfer_manager",
"//tensorflow/compiler/xla/service:tuple_points_to_analysis",
+ "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:stream_executor_no_cuda",
@@ -449,6 +457,7 @@ cc_library(
srcs = ["multi_output_fusion.cc"],
hdrs = ["multi_output_fusion.h"],
deps = [
+ ":instruction_fusion",
":ir_emission_utils",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla/service:hlo",
@@ -461,6 +470,7 @@ tf_cc_test(
name = "multi_output_fusion_test",
srcs = ["multi_output_fusion_test.cc"],
deps = [
+ ":instruction_fusion",
":multi_output_fusion",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:util",
@@ -804,6 +814,7 @@ cc_library(
deps = [
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:statusor",
+ "//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/core:stream_executor_no_cuda",
],
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index ab5149dcdb..537295292b 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -44,17 +44,27 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
num_buffers, device_ordinal, memory_allocator, buffer_assignment));
for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
+ const BufferAllocation& allocation = buffer_assignment->GetAllocation(i);
+ const int64 expected_alignment = [&] {
+ if (allocation.is_entry_computation_parameter()) {
+ return kEntryParameterAlignBytes;
+ } else if (allocation.is_constant()) {
+ return kConstantBufferAlignBytes;
+ } else {
+ return kXlaAllocatedBufferAlignBytes;
+ }
+ }();
+
// If buffer #i's address is already registered (e.g. external arguments or
// result buffers), use that registered buffer.
if (registered_buffers_.count(i)) {
se::DeviceMemoryBase address = FindOrDie(registered_buffers_, i);
- if (reinterpret_cast<uintptr_t>(address.opaque()) %
- kCudaMallocAlignBytes !=
+ if (reinterpret_cast<uintptr_t>(address.opaque()) % expected_alignment !=
0) {
return InternalError(
"Address of registered buffer %lld must be a multiple of %llx, but "
"was %p",
- i, kCudaMallocAlignBytes, address.opaque());
+            i, expected_alignment, address.opaque());
}
buffer_allocations->SetBuffer(i, FindOrDie(registered_buffers_, i));
continue;
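The registered-buffer check above is a modulus test against the alignment class selected per allocation; a hypothetical helper that spells the test out:

    #include <cstdint>

    bool IsAligned(const void* ptr, int64_t alignment) {
      return reinterpret_cast<uintptr_t>(ptr) %
                 static_cast<uintptr_t>(alignment) ==
             0;
    }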
@@ -62,7 +72,6 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
// Allocate each allocation that might escape, or is the temp buffer.
bool seen_temp_buffer = false;
- const BufferAllocation& allocation = buffer_assignment->GetAllocation(i);
if (allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer()) {
const int64 buffer_size = allocation.size();
se::DeviceMemoryBase buffer_address;
@@ -70,13 +79,12 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
OwningDeviceMemory buffer;
TF_ASSIGN_OR_RETURN(
buffer, memory_allocator->Allocate(device_ordinal, buffer_size));
- if (reinterpret_cast<uintptr_t>(buffer.opaque()) %
- kCudaMallocAlignBytes !=
+ if (reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=
0) {
return InternalError(
"Address returned by memory_allocator->Allocate must be a "
"multiple of %llx, but was %p",
- kCudaMallocAlignBytes, buffer.opaque());
+              expected_alignment, buffer.opaque());
}
// We do manual memory management within BufferAllocations. Be sure not
// to do a TF_RETURN_IF_ERROR between this line and the
@@ -165,5 +173,10 @@ void BufferAllocations::SetBuffer(BufferAllocation::Index buffer_index,
buffers_[buffer_index] = buffer;
}
+bool ShouldEmitLiteralInLlvmIr(const Literal& literal) {
+ // LLVM can sometimes do interesting optimizations using scalar constants.
+ return ShapeUtil::IsScalar(literal.shape());
+}
+
} // namespace gpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index 6366235025..f13eab0dd7 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -107,6 +107,12 @@ class BufferAllocations {
bool torn_down_ = false;
};
+// LLVM and PTXAS don't deal well with large constants, so we only emit very
+// small constants directly in LLVM IR. Larger constants are emitted with zero
+// initializers in LLVM IR and are later overwritten when the PTX/CUBIN is
+// loaded.
+bool ShouldEmitLiteralInLlvmIr(const Literal& literal);
+
} // namespace gpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index e594cec2f8..cc38db27e2 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -35,6 +35,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
@@ -67,8 +68,8 @@ bool IsFPLiteralWithValue(const HloInstruction* operand, float value) {
GpuElementalIrEmitter::GpuElementalIrEmitter(
const HloModuleConfig& hlo_module_config, llvm::Module* module,
- llvm::IRBuilder<>* ir_builder, NestedComputer compute_nested)
- : ElementalIrEmitter(hlo_module_config, module, ir_builder),
+ llvm::IRBuilder<>* b, NestedComputer compute_nested)
+ : ElementalIrEmitter(hlo_module_config, module, b),
hlo_module_config_(hlo_module_config),
compute_nested_(std::move(compute_nested)) {}
@@ -92,8 +93,8 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
cast_result_to_fp16 = true;
for (int64 i = 0; i < operands.size(); ++i) {
if (input_types[i] == F16) {
- converted_operands[i] = ir_builder_->CreateFPCast(
- converted_operands[i], ir_builder_->getFloatTy());
+ converted_operands[i] =
+ b_->CreateFPCast(converted_operands[i], b_->getFloatTy());
converted_input_types[i] = F32;
}
}
@@ -112,7 +113,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
converted_input_types, output_type)
.ValueOrDie();
if (cast_result_to_fp16) {
- result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+ result = b_->CreateFPCast(result, b_->getHalfTy());
}
return result;
}
@@ -215,7 +216,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPowerOp(
// LLVM's NVPTX backend knows how to transform 1/sqrt(A) into the NVPTX
// rsqrt.approx instruction.
TF_ASSIGN_OR_RETURN(auto* sqrt, make_sqrt());
- return ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt);
+ return b_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt);
}
VLOG(10) << "emitting pow as regular call to pow(): " << op->ToString();
@@ -277,6 +278,16 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
PrimitiveType output_type = op->shape().element_type();
switch (op->opcode()) {
case HloOpcode::kTanh:
+ // If we don't care much about precision, emit a fast approximation of
+ // tanh.
+ if (hlo_module_config_.debug_options().xla_enable_fast_math()) {
+ // Upcast F16 to F32 if necessary.
+ llvm::Type* type =
+ input_type == F16 ? b_->getFloatTy() : operand_value->getType();
+ llvm::Value* input = b_->CreateFPCast(operand_value, type);
+ llvm::Value* fast_tanh = llvm_ir::EmitFastTanh(b_, input);
+ return b_->CreateFPCast(fast_tanh, operand_value->getType());
+ }
return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type},
output_type);
default:
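For context on the fast-math branch above: one classic cheap tanh substitute is a clamped (3,2) Pade approximant, shown here only to illustrate the speed/precision trade-off; llvm_ir::EmitFastTanh uses its own polynomial.

    #include <algorithm>

    float FastTanhApprox(float x) {
      // Clamp to |x| <= 3; x^3 - 9x^2 + 27x - 27 = (x - 3)^3, so the
      // approximant reaches exactly +/-1 at the clamp boundary.
      x = std::max(-3.0f, std::min(3.0f, x));
      const float x2 = x * x;
      return x * (27.0f + x2) / (27.0f + 9.0f * x2);
    }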
@@ -302,32 +313,31 @@ llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
// Declares the callee if it is not declared already.
llvm::Function* callee = llvm::cast<llvm::Function>(
- ir_builder_->GetInsertBlock()->getModule()->getOrInsertFunction(
+ b_->GetInsertBlock()->getModule()->getOrInsertFunction(
llvm_ir::AsStringRef(callee_name), callee_type));
for (auto attribute : attributes) {
callee->addFnAttr(attribute);
}
- return ir_builder_->CreateCall(callee, llvm_ir::AsArrayRef(operands));
+ return b_->CreateCall(callee, llvm_ir::AsArrayRef(operands));
}
llvm::Value* GpuElementalIrEmitter::EmitThreadId() const {
- llvm::Value* block_id = ir_builder_->CreateIntCast(
+ llvm::Value* block_id = b_->CreateIntCast(
llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
- {}, {}, ir_builder_),
- ir_builder_->getIntNTy(128), /*isSigned=*/true, "block.id");
- llvm::Value* thread_id_in_block = ir_builder_->CreateIntCast(
+ {}, {}, b_),
+ b_->getIntNTy(128), /*isSigned=*/true, "block.id");
+ llvm::Value* thread_id_in_block = b_->CreateIntCast(
llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x,
- {}, {}, ir_builder_),
- ir_builder_->getIntNTy(128), /*isSigned=*/true, "thread.id");
- llvm::Value* threads_per_block = ir_builder_->CreateIntCast(
+ {}, {}, b_),
+ b_->getIntNTy(128), /*isSigned=*/true, "thread.id");
+ llvm::Value* threads_per_block = b_->CreateIntCast(
llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x,
- {}, {}, ir_builder_),
- ir_builder_->getIntNTy(128), /*isSigned=*/true, "threads_per_block");
- return ir_builder_->CreateNSWAdd(
- ir_builder_->CreateNSWMul(block_id, threads_per_block),
- thread_id_in_block);
+ {}, {}, b_),
+ b_->getIntNTy(128), /*isSigned=*/true, "threads_per_block");
+ return b_->CreateNSWAdd(b_->CreateNSWMul(block_id, threads_per_block),
+ thread_id_in_block);
}
llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
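EmitThreadId reads three NVVM sregs (block id, thread id within the block, threads per block) and combines them; the scalar arithmetic, with the i128 widening omitted:

    #include <cstdint>

    int64_t GlobalThreadId(int64_t block_id, int64_t thread_in_block,
                           int64_t threads_per_block) {
      return block_id * threads_per_block + thread_in_block;
    }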
@@ -373,12 +383,12 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
PrimitiveType operand_element_type = operand->shape().element_type();
llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry(
llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_),
- "reduce_window_accum_ptr", ir_builder_);
+ "reduce_window_accum_ptr", b_);
{
TF_ASSIGN_OR_RETURN(llvm::Value * init_value,
operand_to_generator.at(hlo->operand(1))(
IrArray::Index(index.GetType())));
- ir_builder_->CreateStore(init_value, accum_ptr);
+ b_->CreateStore(init_value, accum_ptr);
}
llvm::Type* index_type = index.GetType();
@@ -386,7 +396,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
return index.GetConstantWithIndexType(c);
};
- llvm_ir::ForLoopNest loops(IrName(hlo), ir_builder_, index_type);
+ llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type);
std::vector<int64> window_size;
for (const auto& dim : window.dimensions()) {
window_size.push_back(dim.size());
@@ -395,15 +405,15 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
ShapeUtil::MakeShape(operand_element_type, window_size), "window");
CHECK_EQ(window_index.size(), index.size());
- SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder_);
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_);
IrArray::Index input_index(index_type, index.size());
- llvm::Value* in_bounds = ir_builder_->getInt1(true);
+ llvm::Value* in_bounds = b_->getInt1(true);
for (size_t i = 0; i < index.size(); ++i) {
- llvm::Value* stridden_index = ir_builder_->CreateNSWMul(
+ llvm::Value* stridden_index = b_->CreateNSWMul(
index[i], index_typed_const(window.dimensions(i).stride()));
- input_index[i] = ir_builder_->CreateNSWSub(
- ir_builder_->CreateNSWAdd(stridden_index, window_index[i]),
+ input_index[i] = b_->CreateNSWSub(
+ b_->CreateNSWAdd(stridden_index, window_index[i]),
index_typed_const(window.dimensions(i).padding_low()));
// We must check whether 0 ≤ input_index[i] < bound, as otherwise
@@ -411,16 +421,16 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
// comparison is equivalent to the unsigned comparison
// input_index[i] < bound, as a negative value wraps to a large
// positive value.
- in_bounds = ir_builder_->CreateAnd(
+ in_bounds = b_->CreateAnd(
in_bounds,
- ir_builder_->CreateICmpULT(
+ b_->CreateICmpULT(
input_index[i],
index_typed_const(operand->shape().dimensions(i))));
}
llvm_ir::LlvmIfData if_data =
- llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
- SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+ llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_);
+ SetToFirstInsertPoint(if_data.true_block, b_);
// We are not in pad, so do the computation.
TF_ASSIGN_OR_RETURN(llvm::Value * input_value,
@@ -428,26 +438,26 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
TF_ASSIGN_OR_RETURN(
llvm::Value * accum_value,
compute_nested_(*hlo->to_apply(),
- {ir_builder_->CreateLoad(accum_ptr), input_value}));
- ir_builder_->CreateStore(accum_value, accum_ptr);
+ {b_->CreateLoad(accum_ptr), input_value}));
+ b_->CreateStore(accum_value, accum_ptr);
- SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder_);
- return ir_builder_->CreateLoad(accum_ptr);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_);
+ return b_->CreateLoad(accum_ptr);
};
case HloOpcode::kReduce:
return [=, &operand_to_generator](
const IrArray::Index& output_index) -> StatusOr<llvm::Value*> {
const HloInstruction* operand = hlo->operand(0);
llvm::Value* accum_ptr =
- ir_builder()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
+ b()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
hlo->shape().element_type(), module_));
llvm::Type* index_type = output_index.GetType();
TF_ASSIGN_OR_RETURN(llvm::Value * init_value,
operand_to_generator.at(hlo->operand(1))(
IrArray::Index(index_type)));
- ir_builder()->CreateStore(init_value, accum_ptr);
+ b()->CreateStore(init_value, accum_ptr);
- llvm_ir::ForLoopNest loops(IrName(hlo), ir_builder_, index_type);
+ llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type);
IrArray::Index input_index = loops.AddLoopsForShapeOnDimensions(
operand->shape(), hlo->dimensions(), "reduction_dim");
if (!ShapeUtil::IsScalar(hlo->shape())) {
@@ -462,18 +472,17 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
CHECK_EQ(output_index.size(), j);
}
- SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder());
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b());
TF_ASSIGN_OR_RETURN(
llvm::Value * input_value,
operand_to_generator.at(hlo->operand(0))(input_index));
TF_ASSIGN_OR_RETURN(
llvm::Value * accum_value,
- compute_nested_(
- *hlo->to_apply(),
- {ir_builder()->CreateLoad(accum_ptr), input_value}));
- ir_builder()->CreateStore(accum_value, accum_ptr);
- SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder());
- return ir_builder()->CreateLoad(accum_ptr);
+ compute_nested_(*hlo->to_apply(),
+ {b()->CreateLoad(accum_ptr), input_value}));
+ b()->CreateStore(accum_value, accum_ptr);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b());
+ return b()->CreateLoad(accum_ptr);
};
default:
return ElementalIrEmitter::MakeElementGenerator(hlo,
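The window indexing in the kReduceWindow generator above reduces, per dimension, to one multiply-add and one unsigned compare; a scalar restatement (illustration only):

    #include <cstdint>

    bool WindowElementInBounds(int64_t out_idx, int64_t stride, int64_t win_off,
                               int64_t pad_low, int64_t bound, int64_t* in_idx) {
      *in_idx = out_idx * stride + win_off - pad_low;
      // One unsigned compare covers both 0 <= in_idx and in_idx < bound,
      // since a negative value wraps to a large unsigned value.
      return static_cast<uint64_t>(*in_idx) < static_cast<uint64_t>(bound);
    }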
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index 91f4d960aa..e3eacef133 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -43,7 +43,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
const HloComputation&, tensorflow::gtl::ArraySlice<llvm::Value*>)>;
GpuElementalIrEmitter(const HloModuleConfig& hlo_module_config,
- llvm::Module* module, llvm::IRBuilder<>* ir_builder,
+ llvm::Module* module, llvm::IRBuilder<>* b,
NestedComputer compute_nested);
llvm_ir::ElementGenerator MakeElementGenerator(
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_constants.cc b/tensorflow/compiler/xla/service/gpu/gpu_constants.cc
index aa360c7f73..7f0b030fec 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_constants.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_constants.cc
@@ -14,12 +14,23 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
+#include "tensorflow/core/framework/allocator.h"
namespace xla {
namespace gpu {
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#device-memory-accesses
-const int64 kCudaMallocAlignBytes = 256;
+// kEntryParameterAlignBytes is equal to EIGEN_MAX_ALIGN_BYTES, though including
+// Eigen headers here to get that symbol may not be a good idea.
+// EIGEN_MAX_ALIGN_BYTES may differ between CUDA-enabled builds vs CUDA-disabled
+// builds and we don't want the IR generated by XLA:GPU to depend on that.
+//
+// TODO(b/111767313): Consider raising EIGEN_MAX_ALIGN_BYTES if it helps.
+const int64 kEntryParameterAlignBytes = 16;
+
+const int64 kXlaAllocatedBufferAlignBytes =
+ tensorflow::Allocator::kAllocatorAlignment;
+
+const int64 kConstantBufferAlignBytes = kXlaAllocatedBufferAlignBytes;
} // namespace gpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_constants.h b/tensorflow/compiler/xla/service/gpu/gpu_constants.h
index eb1ca4c6c9..6f5f1fa09c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_constants.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_constants.h
@@ -21,9 +21,15 @@ limitations under the License.
namespace xla {
namespace gpu {
-// Minimum alignment of cudaMalloc. We require that buffers created by our
-// DeviceMemoryAllocator, and all input/output buffers, have this alignment.
-extern const int64 kCudaMallocAlignBytes;
+// Minimum alignment for buffers passed as incoming arguments by TensorFlow.
+extern const int64 kEntryParameterAlignBytes;
+
+// Minimum alignment for buffers allocated by XLA: the temp buffers and the live
+// out (result) buffers.
+extern const int64 kXlaAllocatedBufferAlignBytes;
+
+// Minimum alignment for constant buffers.
+extern const int64 kConstantBufferAlignBytes;
} // namespace gpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
index fbc1303085..75f414e47f 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -48,80 +48,17 @@ StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
TF_ASSIGN_OR_RETURN(bool changed, generic_copy_insertion.Run(module));
- TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
- HloDataflowAnalysis::Run(*module));
-
- // Make sure all operands of a library call are in memory instead of constants
- // in IR. Also, init values of while and conditional nodes cannot be
- // constants. Insert copies for any constants found at the operands of these
- // nodes.
- tensorflow::gtl::FlatSet<HloInstruction*> inserted_copies;
+ // Check the assumption that the epsilon and feature_index constants of the
+ // CUDNN batchnorm op are not shared with other ops where we would replace
+ // them with a copy. These custom op calls are generated with the
+ // CudnnBatchNormRewriter, so this would only happen if HloCSE merges them.
for (HloComputation* computation : module->computations()) {
for (HloInstruction* hlo : computation->instructions()) {
- // Inserts a copy of hlo->operand(n) if it's a constant.
- auto copy_operand_if_constant = [&](int64 n) -> Status {
- HloInstruction* operand = hlo->mutable_operand(n);
- // Skip the operands that have already been replaced with a copy in a
- // previous iteration (which is possible when a constant is used as an
- // operand in multiple places).
- if (ContainsKey(inserted_copies, operand)) {
- return Status::OK();
- }
- for (auto& pair : dataflow->GetInstructionValueSet(operand)) {
- const HloValueSet& value_set = pair.second;
- for (const HloValue* value : value_set.values()) {
- if (value->defining_instruction()->IsConstant() &&
- !ContainsKey(hlo_to_copy_map_, value->defining_instruction())) {
- HloInstruction* constant = value->defining_instruction();
- TF_ASSIGN_OR_RETURN(HloInstruction * copy,
- FindOrInsertCopy(constant));
- TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
- inserted_copies.insert(copy);
- changed = true;
- }
- }
- }
- return Status::OK();
- };
-
- if (IsCustomCallToDnnBatchNorm(*hlo)) {
- // The epsilon and feature_index operands to a CUDNN batchnorm op don't
- // need to be materialized in memory -- in fact, they must be constants.
- // These are the last two operands of all three batchnorm ops.
- for (int64 i = 0; i < hlo->operand_count() - 2; ++i) {
- TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
- }
- } else if (ImplementedAsLibraryCall(*hlo) ||
- hlo->opcode() == HloOpcode::kCrossReplicaSum ||
- hlo->opcode() == HloOpcode::kWhile ||
- hlo->opcode() == HloOpcode::kConditional) {
- // For all other library calls, cross-replica-sum, while and conditional
- // ops materialize all the operands into memory. (Cross-replica-sum
- // gets its constant args materialized even if it's not implemented as a
- // libcall to simplify the implementation. It's slower, but we can
- // constant fold away constant args *anyway*, so we just need to make it
- // work.)
- for (int64 i = 0; i < hlo->operand_count(); ++i) {
- TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
- }
+ if (!IsCustomCallToDnnBatchNorm(*hlo)) {
+ continue;
}
- }
- }
-
- if (changed) {
- // Check the assumption that the epsilon and feature_index constants of the
- // CUDNN batchnorm op are not shared with other ops where we would replace
- // them with a copy. These custom op calls are generated with the
- // CudnnBatchNormRewriter, so this would only happen if HloCSE merges them.
- for (HloComputation* computation : module->computations()) {
- for (HloInstruction* hlo : computation->instructions()) {
- if (!IsCustomCallToDnnBatchNorm(*hlo)) {
- continue;
- }
- for (int64 i = hlo->operand_count() - 2; i < hlo->operand_count();
- ++i) {
- CHECK_EQ(hlo->operand(i)->opcode(), HloOpcode::kConstant);
- }
+ for (int64 i = hlo->operand_count() - 2; i < hlo->operand_count(); ++i) {
+ CHECK_EQ(hlo->operand(i)->opcode(), HloOpcode::kConstant);
}
}
}
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 0cad2958c7..bb71c79fd7 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
@@ -84,7 +85,7 @@ Status GpuExecutable::ExecuteThunks(
}
// Stream 0 indicates `main_stream` and substreams start from stream 1.
- std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
+ std::vector<StreamPool::Ptr> sub_streams;
sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
sub_streams.emplace_back();
@@ -181,6 +182,55 @@ Status GpuExecutable::ExecuteThunks(
return Status::OK();
}
+StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*>
+GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) {
+ tensorflow::mutex_lock lock(module_handle_mutex_);
+ auto it = module_globals_.find(executor);
+ if (it != module_globals_.end()) {
+ return &it->second;
+ }
+
+ se::MultiModuleLoaderSpec module_spec;
+ if (!cubin().empty()) {
+ module_spec.AddCudaCubinInMemory(cubin());
+ }
+ module_spec.AddCudaPtxInMemory(ptx().c_str());
+
+ tensorflow::gtl::FlatMap<int64, se::DeviceMemoryBase> globals;
+ se::ModuleHandle module_handle;
+ executor->LoadModule(module_spec, &module_handle);
+
+ for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+ ++i) {
+ const BufferAllocation& allocation = assignment_->GetAllocation(i);
+ if (allocation.is_constant()) {
+ TF_ASSIGN_OR_RETURN(
+ se::DeviceMemoryBase global,
+ executor->GetUntypedSymbol(
+ llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
+ module_handle));
+ VLOG(3) << "Resolved global "
+ << llvm_ir::ConstantBufferAllocationToGlobalName(allocation)
+ << " to " << global.opaque();
+ InsertOrDie(&globals, i, global);
+
+ const Literal& literal =
+ llvm_ir::LiteralForConstantAllocation(allocation);
+ CHECK(ShapeUtil::IsArray(literal.shape()));
+ if (!ShouldEmitLiteralInLlvmIr(literal)) {
+ VLOG(3) << "H2D memcpy for constant with shape "
+ << ShapeUtil::HumanString(literal.shape());
+ TF_RETURN_IF_ERROR(executor->SynchronousMemcpyH2D(
+ literal.untyped_data(), allocation.size(), &global));
+ }
+ }
+ }
+
+ module_handles_.emplace(executor,
+ se::ScopedModuleHandle(executor, module_handle));
+ return &module_globals_.emplace(executor, std::move(globals)).first->second;
+}
+
StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
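ResolveConstantGlobals memoizes per-StreamExecutor module loads under a mutex. The caching shape, reduced to its essentials with invented types (a sketch, not the class's API):

    #include <map>
    #include <mutex>

    template <typename Key, typename Value, typename MakeValue>
    const Value* GetOrCreate(std::map<Key, Value>* cache, std::mutex* mu,
                             const Key& key, MakeValue make_value) {
      std::lock_guard<std::mutex> lock(*mu);
      auto it = cache->find(key);
      if (it != cache->end()) return &it->second;  // fast path: already loaded
      return &cache->emplace(key, make_value()).first->second;
    }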
@@ -192,6 +242,10 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
}
BufferAllocations::Builder buffer_allocations_builder;
+ se::StreamExecutor* executor = run_options->stream()->parent();
+
+ TF_ASSIGN_OR_RETURN(auto* const globals, ResolveConstantGlobals(executor));
+
for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
++i) {
const BufferAllocation& allocation = assignment_->GetAllocation(i);
@@ -213,8 +267,12 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
buffer_allocations_builder.RegisterBuffer(i, buffer);
}
+
+ if (allocation.is_constant()) {
+ buffer_allocations_builder.RegisterBuffer(i, FindOrDie(*globals, i));
+ }
}
- se::StreamExecutor* executor = run_options->stream()->parent();
+
TF_ASSIGN_OR_RETURN(
auto buffer_allocations,
buffer_allocations_builder.Build(
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 80ec38c3ac..c7ce6d0acb 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -34,6 +34,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/optional.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -66,7 +68,7 @@ class GpuExecutable : public Executable {
}
// Returns the compiled PTX for the computation.
- tensorflow::StringPiece ptx() const { return ptx_; }
+ const string& ptx() const { return ptx_; }
// Returns the cubin (compiled PTX) stored in this GpuExecutable. May be
// empty, in which case compilation is left up to the GPU driver.
@@ -98,6 +100,15 @@ class GpuExecutable : public Executable {
// computation. Uses points-to analysis from buffer assignment.
const PointsToSet& GetRootPointsToSet() const;
+ using BufferAllocToDeviceMemoryMap =
+ tensorflow::gtl::FlatMap<BufferAllocation::Index, se::DeviceMemoryBase>;
+
+ // Loads the PTX or CUBIN for this executable into `executor` and resolves the
+  // globals corresponding to constant buffers. Returns a map from buffer
+  // allocation indices to GPU pointers.
+ StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
+ stream_executor::StreamExecutor* executor);
+
// The LLVM IR, in string format, of the unoptimized module generated for this
// GpuExecutable. We save a string instead of an llvm::Module* because leaving
// llvm::Module* in a singleton can cause the heap checker to emit false
@@ -126,6 +137,14 @@ class GpuExecutable : public Executable {
// memory for every output/temp buffers.
const std::unique_ptr<const BufferAssignment> assignment_;
+ // Cache of module handles and constant buffer allocation maps used by
+ // `ResolveConstantGlobals`.
+ tensorflow::mutex module_handle_mutex_;
+ std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
+ module_handles_ GUARDED_BY(module_handle_mutex_);
+ std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
+ module_globals_ GUARDED_BY(module_handle_mutex_);
+
TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 09ef62c87f..cc67804d56 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -31,8 +31,8 @@ limitations under the License.
namespace xla {
namespace gpu {
-using stream_executor::dnn::DataLayout;
-using stream_executor::dnn::FilterLayout;
+using se::dnn::DataLayout;
+using se::dnn::FilterLayout;
static bool IsVoltaOrLater(const se::StreamExecutor& stream_executor) {
int major, minor;
@@ -44,7 +44,7 @@ static bool IsVoltaOrLater(const se::StreamExecutor& stream_executor) {
// Returns (input, filter, output) layouts.
static std::tuple<DataLayout, FilterLayout, DataLayout>
HeuristicLayoutAssignment(const HloInstruction* instr,
- stream_executor::StreamExecutor* stream_executor) {
+ se::StreamExecutor* stream_executor) {
// DataLayout and FilterLayout uses weird enum names. Translations:
// N <=> Batch or Output
// C <=> Depth or Input
@@ -52,31 +52,44 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
// W <=> X
//
// Therefore kOutputInputYX and kBatchDepthYX mean NCHW.
+ //
+ // If you have trouble keeping these straight, consider that all that matters
+ // is the location of the channel dim: Is it major (NCHW), or minor (NHWC)?
+
+ constexpr auto kAllNCHW =
+ std::make_tuple(DataLayout::kBatchDepthYX, FilterLayout::kOutputInputYX,
+ DataLayout::kBatchDepthYX);
+ constexpr auto kAllNHWC =
+ std::make_tuple(DataLayout::kBatchYXDepth, FilterLayout::kOutputYXInput,
+ DataLayout::kBatchYXDepth);
- // As of today, our empirical evidence is that cudnn 7.0 is faster on V100 x
- // fp16 with the mostly-NHWC layout. The heuristic may change as cudnn version
- // changes, as well as the hardware updates.
+ // If we're not Volta or not fp16, the decision is easy: Use NCHW.
if (!(instr->operand(0)->shape().element_type() == xla::PrimitiveType::F16 &&
IsVoltaOrLater(*stream_executor))) {
- return std::make_tuple(DataLayout::kBatchDepthYX,
- FilterLayout::kOutputInputYX,
- DataLayout::kBatchDepthYX);
+ return kAllNCHW;
}
+
VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString();
- // For BackwardInput that has stride, full NHWC layouts run significantly
- // slower than (NHWC, NCHW, NCHW) or (NHWC, NCHW, NHWC).
+
+ // Empirically we've found with Volta and cudnn 7 that backward-input convs
+ // with stride are significantly faster with NCHW layouts.
//
- // TODO(timshen): more closely compare (NHWC, NCHW, NCHW) and (NHWC, NCHW,
- // NHWC).
+ // We could have used a mixed layout combination, e.g. (NHWC, NCHW, NCHW),
+ // which on paper gives good performance. However, there are two observations:
+  //   * a mixed layout combination is more prone to cuDNN bugs, based on
+  //     empirical evidence.
+ // * we've also observed that for mixed layouts, cuDNN transposes data back
+ // and forth from a different layout combination. If we end up with
+ // transposes anyway, we prefer to have them in XLA, as they can be fused.
+ // TODO(timshen): Figure out the exact condition. This may be achieved by
+ // auto-tuning layouts offline.
if (instr->custom_call_target() == kCudnnConvBackwardInputCallTarget &&
window_util::HasStride(instr->window())) {
- return std::make_tuple(DataLayout::kBatchYXDepth,
- FilterLayout::kOutputInputYX,
- DataLayout::kBatchDepthYX);
+ return kAllNCHW;
}
- return std::make_tuple(DataLayout::kBatchYXDepth,
- FilterLayout::kOutputYXInput,
- DataLayout::kBatchYXDepth);
+
+ // For other Volta f16 convolutions, use NHWC.
+ return kAllNHWC;
}
// Adds layout constraints on the cudnn custom-call instruction. The layout
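Editor's note: a minimal standalone restatement of the decision table the new code implements; names here are illustrative, not XLA API:

enum class ConvLayout { NCHW, NHWC };

// Mirrors HeuristicLayoutAssignment above: NCHW unless we have an fp16
// convolution on Volta or later, and NCHW again for strided backward-input
// convolutions; all remaining Volta fp16 convolutions get NHWC.
ConvLayout ChooseLayout(bool is_fp16, bool is_volta_or_later,
                        bool is_backward_input_with_stride) {
  if (!(is_fp16 && is_volta_or_later)) return ConvLayout::NCHW;
  if (is_backward_input_with_stride) return ConvLayout::NCHW;
  return ConvLayout::NHWC;
}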
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
index 19420e590d..1722676930 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
@@ -23,7 +23,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/util/ptr_util.h"
@@ -37,10 +37,9 @@ void InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>>* timers,
stream->InitTimer(timers->top().get()).ThenStartTimer(timers->top().get());
}
-uint64 GetCyclesTaken(
- std::stack<std::unique_ptr<se::Timer>>* timers,
- const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
- se::Stream* stream, double clock_rate_ghz) {
+uint64 GetCyclesTaken(std::stack<std::unique_ptr<se::Timer>>* timers,
+ const std::vector<StreamPool::Ptr>& sub_streams,
+ se::Stream* stream, double clock_rate_ghz) {
CHECK_GT(timers->size(), 0);
stream->ThenWaitFor(&sub_streams);
stream->ThenStopTimer(timers->top().get());
@@ -53,7 +52,7 @@ uint64 GetCyclesTaken(
HloExecutionProfiler::HloExecutionProfiler(
bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
- const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
+ const std::vector<StreamPool::Ptr>& sub_streams,
const HloComputation* computation)
: do_profile_(do_profile),
profile_(profile),
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
index 6654850bef..80cde75f2b 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
@@ -24,7 +24,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace xla {
@@ -38,10 +38,10 @@ class ScopedInstructionProfiler;
class HloExecutionProfiler {
public:
// If profiling is enabled, start an execution timer running.
- explicit HloExecutionProfiler(
- bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
- const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
- const HloComputation* computation);
+ explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
+ se::Stream* stream,
+ const std::vector<StreamPool::Ptr>& sub_streams,
+ const HloComputation* computation);
// If profiling is enabled, sets the total cycle count on the profile from the
// execution timer.
@@ -72,7 +72,7 @@ class HloExecutionProfiler {
double clock_rate_ghz_;
HloExecutionProfile* profile_;
se::Stream* stream_;
- const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams_;
+ const std::vector<StreamPool::Ptr>& sub_streams_;
const HloComputation* computation_;
std::stack<std::unique_ptr<se::Timer>> timers_;
// Contains the HLO instructions for which we are currently measuring the
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index 6f2a7e1850..8c11cd0541 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -18,8 +18,10 @@ limitations under the License.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/core/lib/strings/str_util.h"
@@ -39,7 +41,7 @@ void HloToIrBindings::EmitBasePointersForHlos(
// I/O HLOs are bound to the arguments of the current IR function. I.e.,
//
// void IrFunction(io_0, io_1, ..., io_{m-1}, temp_buffer_base) {
- llvm::Function* function = ir_builder_->GetInsertBlock()->getParent();
+ llvm::Function* function = b_->GetInsertBlock()->getParent();
CHECK_EQ(io_hlos.size() + 1, function->arg_size());
// An HLO can have duplicated operands. This data structure remembers which
@@ -79,8 +81,8 @@ void HloToIrBindings::EmitBasePointersForHlos(
const int64 offset = slice.offset();
CHECK_NE(nullptr, temp_buffer_base_);
// Emit IR for GetTupleElement instruction and bind to emitted value.
- llvm::Value* base_ptr = ir_builder_->CreateInBoundsGEP(
- temp_buffer_base_, ir_builder_->getInt64(offset));
+ llvm::Value* base_ptr =
+ b_->CreateInBoundsGEP(temp_buffer_base_, b_->getInt64(offset));
BindHloToIrValue(*non_io_hlo,
EmitGetTupleElement(non_io_hlo, base_ptr));
}
@@ -108,15 +110,20 @@ void HloToIrBindings::EmitBasePointersForHlos(
if (slice.allocation()->is_thread_local()) {
llvm::Type* pointee_type =
llvm_ir::ShapeToIrType(non_io_hlo->shape(), module_);
- BindHloToIrValue(*non_io_hlo,
- ir_builder_->CreateAlloca(pointee_type), index);
+ BindHloToIrValue(*non_io_hlo, b_->CreateAlloca(pointee_type),
+ index);
+ } else if (slice.allocation()->is_constant()) {
+ llvm::Value* global_for_constant =
+ module_->getGlobalVariable(llvm_ir::AsStringRef(
+ llvm_ir::ConstantBufferAllocationToGlobalName(
+ *slice.allocation())));
+ BindHloToIrValue(*non_io_hlo, global_for_constant);
} else {
const int64 offset = slice.offset();
CHECK_NE(nullptr, temp_buffer_base_);
BindHloToIrValue(
*non_io_hlo,
- ir_builder_->CreateInBoundsGEP(temp_buffer_base_,
- ir_builder_->getInt64(offset)),
+ b_->CreateInBoundsGEP(temp_buffer_base_, b_->getInt64(offset)),
index);
}
});
@@ -129,11 +136,19 @@ llvm::Value* HloToIrBindings::EmitGetTupleElement(const HloInstruction* gte,
if (gte->operand(0)->opcode() != HloOpcode::kGetTupleElement) {
return llvm_ir::EmitGetTupleElement(
gte->shape(), gte->tuple_index(), /*alignment=*/1,
- GetTypedIrValue(*gte->operand(0), {}, base_ptr), ir_builder_, module_);
+ GetTypedIrValue(*gte->operand(0), {}, base_ptr), b_, module_);
}
return llvm_ir::EmitGetTupleElement(
gte->shape(), gte->tuple_index(), /*alignment=*/1,
- EmitGetTupleElement(gte->operand(0), base_ptr), ir_builder_, module_);
+ EmitGetTupleElement(gte->operand(0), base_ptr), b_, module_);
+}
+
+// Returns true if `value` has a name that should not be changed.
+static bool HasMeaningfulName(llvm::Value* value) {
+ if (auto* global = llvm::dyn_cast<llvm::GlobalValue>(value)) {
+ return global->getLinkage() != llvm::GlobalValue::PrivateLinkage;
+ }
+ return false;
}
llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo,
@@ -148,11 +163,15 @@ llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo,
typed_ir_value = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
llvm::cast<llvm::GlobalVariable>(ir_value), dest_type);
} else {
- typed_ir_value =
- ir_builder_->CreateBitCast(ir_value, pointee_type->getPointerTo());
+ typed_ir_value = b_->CreateBitCast(ir_value, pointee_type->getPointerTo());
+ }
+ if (!HasMeaningfulName(ir_value)) {
+ ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "raw")));
+ }
+ if (!HasMeaningfulName(typed_ir_value)) {
+ typed_ir_value->setName(
+ llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "typed")));
}
- ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "raw")));
- typed_ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "typed")));
return typed_ir_value;
}
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
index a86e6e78c6..eee40b0e91 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
@@ -36,14 +36,13 @@ class HloToIrBindings {
public:
HloToIrBindings(const HloModule& module,
const BufferAssignment* buffer_assignment,
- llvm::IRBuilder<>* ir_builder, llvm::Module* llvm_module,
+ llvm::IRBuilder<>* b, llvm::Module* llvm_module,
bool is_nested)
: buffer_assignment_(buffer_assignment),
is_nested_(is_nested),
- ir_builder_(ir_builder),
+ b_(b),
module_(llvm_module),
- alias_analysis_(module, *buffer_assignment_,
- &ir_builder_->getContext()) {}
+ alias_analysis_(module, *buffer_assignment_, &b_->getContext()) {}
void EmitBasePointersForHlos(
tensorflow::gtl::ArraySlice<const HloInstruction*> io_hlos,
@@ -104,7 +103,7 @@ class HloToIrBindings {
const bool is_nested_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
llvm::Module* module_;
// Stores the underlying llvm::IrArray for each HloInstruction.
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 64ed3d748f..0f2c83aeb2 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -73,6 +73,67 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
}
}
+// This function limits the maximum number of operands to a fusion.
+//
+// There's a cap on how many parameters we can pass to a CUDA kernel, but
+// exactly what that limit is is hazy, as it depends on (among other things) how
+// much GPU constant memory is in use for other purposes.
+//
+// Moreover, we don't even know at the point that we're running fusion how many
+// arguments the CUDA kernel for a fusion node will have: It depends on buffer
+// assignment, where we will decide which of the fusion's operands live in XLA's
+// big temp buffer versus in other allocations.
+//
+// As a heuristic, we simply cap the number of fusion operands plus outputs at
+// kMaxOperandsAndOutputsPerFusion. This puts an upper bound on the number of
+// parameters to the kernel, working around the correctness problem.
+//
+// This limit is also often good for performance. In a fusion with many
+// operands, each GPU thread likely has to do a lot of work, and so possibly
+// uses a lot of registers, thus limiting occupancy.
+/*static*/ bool GpuInstructionFusion::FusionWouldBeTooLarge(
+ const HloInstruction* a, const HloInstruction* b) {
+ // Compute the number of outputs of the (possibly multi-output) fusion node
+ // we're considering creating.
+ //
+ // This isn't precise; we may be off by one if
+ // - We're creating a multi-output fusion out of two non-MOFs. Creating a
+ // MOF adds a new buffer, namely, the tuple buffer.
+ // - We're merging two MOFs. In this case, we should count the tuple buffer
+ // only once.
+ // - WLOG there's an edge from `a` to `b` and `b` is the only consumer of
+ // `a`. In this case the result of `a` is not part of the output of the
+ // fusion.
+ //
+ // But because this is a heuristic and our limit
+ // kMaxOperandsAndOutputsPerFusion is a large value (so +/- 1 doesn't make a
+ // big difference), we ignore this small inaccuracy in favor of simplicity.
+ int64 num_output_buffers = ShapeUtil::SubshapeCount(a->shape()) +
+ ShapeUtil::SubshapeCount(b->shape());
+
+ // The new fusion will have no more operands and outputs than
+ // producer_operands + consumer_operands - 1 + num_output_buffers
+ // (minus one because we may be fusing a producer->consumer edge between `a`
+ // and `b`).
+ //
+ // This fact may be enough to let us avoid having to compute the true total
+ // number of operands, which can be expensive.
+ if (a->operand_count() + b->operand_count() - 1 + num_output_buffers <=
+ kMaxOperandsAndOutputsPerFusion) {
+ return false;
+ }
+
+ // Compute the precise number of operands to the new fusion.
+ tensorflow::gtl::FlatSet<const HloInstruction*> operands(
+ a->operands().begin(), a->operands().end());
+ operands.insert(b->operands().begin(), b->operands().end());
+ // If there's an edge between `a` and `b`, don't count it: We're fusing that
+ // producer -> consumer relationship.
+ operands.erase(a);
+ operands.erase(b);
+ return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion;
+}
+
bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
int64 operand_index) {
HloInstruction* producer = consumer->mutable_operand(operand_index);
@@ -141,6 +202,7 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
IsIEEEFloatingPointScalarConstant(producer->operand(0)) &&
fused_parameter_users[0]->opcode() == HloOpcode::kMultiply;
}
+ return false;
}
// Other output fusions are not currently supported on GPUs.
@@ -183,8 +245,13 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
return true;
}
- return IsFusile(*producer) && IsFusile(*consumer) &&
- InstructionFusion::ShouldFuse(consumer, operand_index);
+ if (!IsFusile(*producer) || !IsFusile(*consumer) ||
+ !InstructionFusion::ShouldFuse(consumer, operand_index)) {
+ return false;
+ }
+
+ // We put this check last because it's potentially expensive.
+ return !FusionWouldBeTooLarge(consumer, producer);
}
bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer,
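Editor's note: a worked instance of the two-stage check above, under the diff's limit of 64. Fusing a producer with 40 operands into a consumer with 30 operands and 2 output buffers gives the cheap bound 40 + 30 - 1 + 2 = 71 > 64, and only then is the exact operand set computed. A sketch of the cheap bound (the constant mirrors the diff; the precise set-based count is elided):

#include <cstdint>

constexpr int64_t kMaxOperandsAndOutputsPerFusion = 64;  // as in the diff

// Upper bound on the fused node's operands + outputs: producer operands plus
// consumer operands, minus one for a fused producer->consumer edge, plus the
// number of output buffers. If even this bound fits, the fusion is fine and
// the expensive precise count can be skipped.
bool QuickBoundFits(int64_t a_operands, int64_t b_operands,
                    int64_t num_output_buffers) {
  return a_operands + b_operands - 1 + num_output_buffers <=
         kMaxOperandsAndOutputsPerFusion;
}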
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
index f629d9ff2c..c91f6343a6 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
@@ -27,6 +27,19 @@ class GpuInstructionFusion : public InstructionFusion {
explicit GpuInstructionFusion(bool may_duplicate)
: InstructionFusion(GpuInstructionFusion::IsExpensive, may_duplicate) {}
+ // Maximum number of operands plus outputs allowed on a single fusion node.
+ // Exposed publicly mainly for tests.
+ static constexpr int64 kMaxOperandsAndOutputsPerFusion = 64;
+
+ // Determines whether the combination of `a` and `b` into a (possibly
+ // multi-output) fusion would be "too large" -- i.e., have more operands and
+ // outputs than is allowed.
+ //
+ // `ShouldFuse` and `ShouldFuseIntoMultiOutput` call this; it's public so that
+ // other fusion passes (e.g. GPU multi-output fusion) can also call this.
+ static bool FusionWouldBeTooLarge(const HloInstruction* a,
+ const HloInstruction* b);
+
static bool IsExpensive(const HloInstruction& instruction);
bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override;
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 98ba162cd9..8d0522bd8f 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -606,5 +606,35 @@ TEST_F(InstructionFusionTest, FuseScalarConstant) {
op::Parameter()));
}
+// Check that we limit the number of operands to fusions we create.
+TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
+ constexpr int64 kNumParams = 200;
+ ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion);
+
+ // Compute p0 + p1 + ... + pN.
+ HloComputation::Builder b(TestName());
+ Shape shape = ShapeUtil::MakeShape(F32, {10, 100});
+ auto param0 =
+ b.AddInstruction(HloInstruction::CreateParameter(0, shape, "p"));
+ auto sum = param0;
+ for (int64 i = 1; i < kNumParams; ++i) {
+ auto param =
+ b.AddInstruction(HloInstruction::CreateParameter(i, shape, "p"));
+ sum = b.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sum, param));
+ }
+ auto module = CreateNewModule();
+ auto computation = module->AddEntryComputation(b.Build());
+ EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+ .Run(module.get())
+ .ValueOrDie());
+ SCOPED_TRACE(module->ToString());
+ for (const HloInstruction* instr : computation->instructions()) {
+ EXPECT_LE(instr->operand_count(),
+ GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion)
+ << instr->ToString();
+ }
+}
+
} // namespace gpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 2799baab41..6352b330d1 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -81,11 +81,6 @@ bool DotImplementedAsGemm(const HloInstruction& dot) {
} // namespace
bool ImplementedAsGemm(const HloInstruction& hlo) {
- // We can only do this if the HLO is unnested.
- if (hlo.parent() != hlo.GetModule()->entry_computation()) {
- return false;
- }
-
// For certain types of Dot, we can call pre-canned BLAS gemm.
if (hlo.opcode() == HloOpcode::kDot) {
return DotImplementedAsGemm(hlo);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index 9bb4c42b15..5d23a3d018 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -31,6 +31,12 @@ namespace gpu {
constexpr int64 kWarpSize = 32;
// Returns true if `hlo` will be implemented as a call to BLAS gemm.
+//
+// Precondition: `hlo` is in an "unnested context", meaning it lives within the
+// entry computation, within either of a while loop's subcomputations,
+// within any of a conditional's subcomputations, etc., but *does not* live
+// within a reduce subcomputation, a map subcomputation, a fusion
+// subcomputation, etc. It's OK if `hlo` *is* a fusion.
bool ImplementedAsGemm(const HloInstruction& hlo);
// A call to cuDNN for batch normalization is represented as CustomCall HLO with
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 449a18e710..1295e83c0c 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -37,7 +37,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/compiler/xla/service/name_uniquer.h"
#include "tensorflow/compiler/xla/shape_util.h"
@@ -58,12 +57,12 @@ IrEmitter::IrEmitter(const HloModuleConfig& hlo_module_config,
IrEmitterContext* ir_emitter_context, bool is_nested)
: ir_emitter_context_(ir_emitter_context),
module_(ir_emitter_context->llvm_module()),
- ir_builder_(module_->getContext()),
+ b_(module_->getContext()),
bindings_(ir_emitter_context->hlo_module(),
- &ir_emitter_context->buffer_assignment(), &ir_builder_, module_,
+ &ir_emitter_context->buffer_assignment(), &b_, module_,
is_nested),
hlo_module_config_(hlo_module_config) {
- ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(
+ b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
/*fast_math_enabled=*/hlo_module_config.debug_options()
.xla_enable_fast_math()));
}
@@ -72,30 +71,16 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
for (const HloInstruction* operand : hlo->operands()) {
operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
- return GetIrArray(*operand, *hlo)
- .EmitReadArrayElement(index, &ir_builder_);
+ return GetIrArray(*operand, *hlo).EmitReadArrayElement(index, &b_);
};
}
return EmitTargetElementLoop(
- *hlo, GpuElementalIrEmitter(hlo_module_config_, module_, &ir_builder_,
+ *hlo, GpuElementalIrEmitter(hlo_module_config_, module_, &b_,
GetNestedComputer())
.MakeElementGenerator(hlo, operand_to_generator));
}
Status IrEmitter::HandleConstant(HloInstruction* constant) {
- const Literal& literal = constant->literal();
- llvm::Constant* initializer =
- llvm_ir::ConvertLiteralToIrConstant(literal, module_);
- llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
- *module_, initializer->getType(),
- /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, initializer,
- /*Name=*/"");
- VLOG(2) << "HandleConstant: " << constant->ToString() << std::endl
- << " emitted_value: " << llvm_ir::DumpToString(*global_for_const)
- << std::endl
- << " its type: "
- << llvm_ir::DumpToString(*global_for_const->getType());
- bindings_.BindHloToIrValue(*constant, global_for_const);
return Status::OK();
}
@@ -120,21 +105,10 @@ Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) {
get_tuple_element->shape(), get_tuple_element->tuple_index(),
// TODO(b/26344050): tighten the alignment here
// based on the real element type.
- /*alignment=*/1, GetBasePointer(*operand), &ir_builder_, module_));
+ /*alignment=*/1, GetBasePointer(*operand), &b_, module_));
return Status::OK();
}
-Status IrEmitter::HandleSort(HloInstruction* sort) {
- auto values = sort->operand_count() > 1 ? sort->operand(1) : nullptr;
- if (values != nullptr) {
- // TODO(b/26783907): Also sort the values by their corresponding key.
- return Unimplemented("Key/Value Sort is not implemented on GPU");
- }
- int dimension_to_sort = sort->dimensions(0);
- return llvm_ir::EmitSortInPlace(dimension_to_sort, GetIrArray(*sort, *sort),
- IrName(sort), &ir_builder_);
-}
-
Status IrEmitter::HandleSend(HloInstruction*) {
return Unimplemented("Send is not implemented on GPU");
}
@@ -156,8 +130,7 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
for (const HloInstruction* operand : tuple->operands()) {
base_ptrs.push_back(GetBasePointer(*operand));
}
- llvm_ir::EmitTuple(GetIrArray(*tuple, *tuple), base_ptrs, &ir_builder_,
- module_);
+ llvm_ir::EmitTuple(GetIrArray(*tuple, *tuple), base_ptrs, &b_, module_);
return Status::OK();
}
@@ -178,7 +151,7 @@ Status IrEmitter::EmitCallToNestedComputation(
std::vector<llvm::Value*> arguments(operands.begin(), operands.end());
arguments.push_back(output);
arguments.push_back(bindings_.GetTempBufferBase());
- ir_builder_.CreateCall(emitted_function, arguments);
+ b_.CreateCall(emitted_function, arguments);
return Status::OK();
}
@@ -200,21 +173,20 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation(
computation.root_instruction()->shape().element_type();
bool is_atomic_integral = element_type == S32 || element_type == U32 ||
element_type == S64 || element_type == U64;
- llvm::Value* source = ir_builder_.CreateLoad(source_address, "source");
+ llvm::Value* source = b_.CreateLoad(source_address, "source");
if (root_opcode == HloOpcode::kAdd) {
// NVPTX supports atomicAdd on F32 and integer types.
if (element_type == F32) {
// F32 + F32
llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_atomic_load_add_f32,
{output_address, source},
- {output_address->getType()}, &ir_builder_);
+ {output_address->getType()}, &b_);
return true;
}
if (is_atomic_integral) {
// integral + integral
- ir_builder_.CreateAtomicRMW(llvm::AtomicRMWInst::Add, output_address,
- source,
- llvm::AtomicOrdering::SequentiallyConsistent);
+ b_.CreateAtomicRMW(llvm::AtomicRMWInst::Add, output_address, source,
+ llvm::AtomicOrdering::SequentiallyConsistent);
return true;
}
}
@@ -225,8 +197,8 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation(
auto opcode = primitive_util::IsSignedIntegralType(element_type)
? llvm::AtomicRMWInst::Max
: llvm::AtomicRMWInst::UMax;
- ir_builder_.CreateAtomicRMW(opcode, output_address, source,
- llvm::AtomicOrdering::SequentiallyConsistent);
+ b_.CreateAtomicRMW(opcode, output_address, source,
+ llvm::AtomicOrdering::SequentiallyConsistent);
return true;
}
@@ -235,8 +207,8 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation(
auto opcode = primitive_util::IsSignedIntegralType(element_type)
? llvm::AtomicRMWInst::Min
: llvm::AtomicRMWInst::UMin;
- ir_builder_.CreateAtomicRMW(opcode, output_address, source,
- llvm::AtomicOrdering::SequentiallyConsistent);
+ b_.CreateAtomicRMW(opcode, output_address, source,
+ llvm::AtomicOrdering::SequentiallyConsistent);
return true;
}
@@ -308,20 +280,20 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
llvm::Type* element_address_type = element_type->getPointerTo();
int atomic_size = (element_size < 32) ? 32 : element_size;
- llvm::Type* atomic_type = ir_builder_.getIntNTy(atomic_size);
+ llvm::Type* atomic_type = b_.getIntNTy(atomic_size);
llvm::Type* atomic_address_type =
atomic_type->getPointerTo(output_address_type->getPointerAddressSpace());
// cas_old_output_address and cas_new_output_address point to the scratch
// memory where we store the old and new values for the repeated atomicCAS
// operations.
- llvm::Value* cas_old_output_address = ir_builder_.CreateAlloca(
+ llvm::Value* cas_old_output_address = b_.CreateAlloca(
atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address");
- llvm::Value* cas_new_output_address = ir_builder_.CreateAlloca(
+ llvm::Value* cas_new_output_address = b_.CreateAlloca(
atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address");
// Emit preparation code to the preheader.
- llvm::BasicBlock* loop_preheader_bb = ir_builder_.GetInsertBlock();
+ llvm::BasicBlock* loop_preheader_bb = b_.GetInsertBlock();
llvm::Value* atomic_memory_address;
// binop_output_address points to the scratch memory that stores the
@@ -332,77 +304,71 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
CHECK_EQ((element_size % sizeof(char)), 0);
llvm::Type* address_int_type =
module_->getDataLayout().getIntPtrType(output_address_type);
- atomic_memory_address =
- ir_builder_.CreatePtrToInt(output_address, address_int_type);
+ atomic_memory_address = b_.CreatePtrToInt(output_address, address_int_type);
llvm::Value* mask = llvm::ConstantInt::get(address_int_type, 3);
- llvm::Value* offset = ir_builder_.CreateAnd(atomic_memory_address, mask);
+ llvm::Value* offset = b_.CreateAnd(atomic_memory_address, mask);
mask = llvm::ConstantInt::get(address_int_type, -4);
- atomic_memory_address = ir_builder_.CreateAnd(atomic_memory_address, mask);
+ atomic_memory_address = b_.CreateAnd(atomic_memory_address, mask);
atomic_memory_address =
- ir_builder_.CreateIntToPtr(atomic_memory_address, atomic_address_type);
- binop_output_address = ir_builder_.CreateAdd(
- ir_builder_.CreatePtrToInt(cas_new_output_address, address_int_type),
- offset);
+ b_.CreateIntToPtr(atomic_memory_address, atomic_address_type);
+ binop_output_address = b_.CreateAdd(
+ b_.CreatePtrToInt(cas_new_output_address, address_int_type), offset);
binop_output_address =
- ir_builder_.CreateIntToPtr(binop_output_address, element_address_type);
+ b_.CreateIntToPtr(binop_output_address, element_address_type);
} else {
atomic_memory_address =
- ir_builder_.CreateBitCast(output_address, atomic_address_type);
+ b_.CreateBitCast(output_address, atomic_address_type);
binop_output_address =
- ir_builder_.CreateBitCast(cas_new_output_address, element_address_type);
+ b_.CreateBitCast(cas_new_output_address, element_address_type);
}
// Use the value from the memory that atomicCAS operates on to initialize
// cas_old_output.
llvm::Value* cas_old_output =
- ir_builder_.CreateLoad(atomic_memory_address, "cas_old_output");
- ir_builder_.CreateStore(cas_old_output, cas_old_output_address);
+ b_.CreateLoad(atomic_memory_address, "cas_old_output");
+ b_.CreateStore(cas_old_output, cas_old_output_address);
llvm::BasicBlock* loop_exit_bb = loop_preheader_bb->splitBasicBlock(
- ir_builder_.GetInsertPoint(), "atomic_op_loop_exit");
- llvm::BasicBlock* loop_body_bb =
- llvm::BasicBlock::Create(ir_builder_.getContext(), "atomic_op_loop_body",
- ir_builder_.GetInsertBlock()->getParent());
- ir_builder_.SetInsertPoint(loop_body_bb);
+ b_.GetInsertPoint(), "atomic_op_loop_exit");
+ llvm::BasicBlock* loop_body_bb = llvm::BasicBlock::Create(
+ b_.getContext(), "atomic_op_loop_body", b_.GetInsertBlock()->getParent());
+ b_.SetInsertPoint(loop_body_bb);
// Change preheader's successor from loop_exit_bb to loop_body_bb.
loop_preheader_bb->getTerminator()->setSuccessor(0, loop_body_bb);
// Emit the body of the loop that repeatedly invokes atomicCAS.
//
// Use cas_old_output to initialize cas_new_output.
- cas_old_output =
- ir_builder_.CreateLoad(cas_old_output_address, "cas_old_output");
- ir_builder_.CreateStore(cas_old_output, cas_new_output_address);
+ cas_old_output = b_.CreateLoad(cas_old_output_address, "cas_old_output");
+ b_.CreateStore(cas_old_output, cas_new_output_address);
// Emits code to calculate new_output = operation(old_output, source);
TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
computation, {binop_output_address, source_address},
binop_output_address));
llvm::Value* cas_new_output =
- ir_builder_.CreateLoad(cas_new_output_address, "cas_new_output");
+ b_.CreateLoad(cas_new_output_address, "cas_new_output");
// Emit code to perform the atomicCAS operation
// (cas_old_output, success) = atomicCAS(memory_address, cas_old_output,
// cas_new_output);
- llvm::Value* ret_value = ir_builder_.CreateAtomicCmpXchg(
+ llvm::Value* ret_value = b_.CreateAtomicCmpXchg(
atomic_memory_address, cas_old_output, cas_new_output,
llvm::AtomicOrdering::SequentiallyConsistent,
llvm::AtomicOrdering::SequentiallyConsistent);
// Extract the memory value returned from atomicCAS and store it as
// cas_old_output.
- ir_builder_.CreateStore(
- ir_builder_.CreateExtractValue(ret_value, 0, "cas_old_output"),
- cas_old_output_address);
+ b_.CreateStore(b_.CreateExtractValue(ret_value, 0, "cas_old_output"),
+ cas_old_output_address);
// Extract the success bit returned from atomicCAS and generate a
// conditional branch on the success bit.
- ir_builder_.CreateCondBr(
- ir_builder_.CreateExtractValue(ret_value, 1, "success"), loop_exit_bb,
- loop_body_bb);
+ b_.CreateCondBr(b_.CreateExtractValue(ret_value, 1, "success"), loop_exit_bb,
+ loop_body_bb);
// Set the insertion point to the exit basic block so that the caller of
// this method can continue emitting code to the right place.
- SetToFirstInsertPoint(loop_exit_bb, &ir_builder_);
+ SetToFirstInsertPoint(loop_exit_bb, &b_);
return Status::OK();
}
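Editor's note: the IR emitted above implements the classic compare-and-swap retry loop. A plain C++ sketch of the same pattern, where std::atomic stands in for the device-side atomicCAS and the nested computation becomes the op functor:

#include <atomic>

template <typename T, typename BinOp>
void AtomicApply(std::atomic<T>* address, T source, BinOp op) {
  T old_value = address->load();  // corresponds to the preheader load
  T new_value;
  do {
    new_value = op(old_value, source);  // new = operation(old, source)
    // On failure, compare_exchange_weak refreshes old_value with the current
    // memory contents, just as the emitted loop stores the atomicCAS result
    // back into cas_old_output_address before retrying.
  } while (!address->compare_exchange_weak(old_value, new_value));
}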
@@ -445,32 +411,32 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
llvm_ir::EmitTupleSelect(GetIrArray(*tuple_select, *tuple_select),
GetIrArray(*pred, *tuple_select),
GetBasePointer(*on_true), GetBasePointer(*on_false),
- &ir_builder_, module_);
+ &b_, module_);
return Status::OK();
}
namespace {
-llvm::Value* Real(llvm::Value* x, llvm::IRBuilder<>* ir_builder) {
- return ir_builder->CreateExtractValue(x, {0});
-}
-
-llvm::Value* Imag(llvm::Value* x, llvm::IRBuilder<>* ir_builder) {
- return ir_builder->CreateExtractValue(x, {1});
-}
-
-std::pair<llvm::Value*, llvm::Value*> MultiplyComplex(
- llvm::Value* lhs_value, llvm::Value* rhs_value,
- llvm::IRBuilder<>* ir_builder) {
- llvm::Value* lhs_real = Real(lhs_value, ir_builder);
- llvm::Value* lhs_imag = Imag(lhs_value, ir_builder);
- llvm::Value* rhs_real = Real(rhs_value, ir_builder);
- llvm::Value* rhs_imag = Imag(rhs_value, ir_builder);
- llvm::Value* real_result1 = ir_builder->CreateFMul(lhs_real, rhs_real);
- llvm::Value* real_result2 = ir_builder->CreateFMul(lhs_imag, rhs_imag);
- llvm::Value* real_result = ir_builder->CreateFSub(real_result1, real_result2);
- llvm::Value* imag_result1 = ir_builder->CreateFMul(lhs_real, rhs_imag);
- llvm::Value* imag_result2 = ir_builder->CreateFMul(lhs_imag, rhs_real);
- llvm::Value* imag_result = ir_builder->CreateFAdd(imag_result1, imag_result2);
+llvm::Value* Real(llvm::Value* x, llvm::IRBuilder<>* b) {
+ return b->CreateExtractValue(x, {0});
+}
+
+llvm::Value* Imag(llvm::Value* x, llvm::IRBuilder<>* b) {
+ return b->CreateExtractValue(x, {1});
+}
+
+std::pair<llvm::Value*, llvm::Value*> MultiplyComplex(llvm::Value* lhs_value,
+ llvm::Value* rhs_value,
+ llvm::IRBuilder<>* b) {
+ llvm::Value* lhs_real = Real(lhs_value, b);
+ llvm::Value* lhs_imag = Imag(lhs_value, b);
+ llvm::Value* rhs_real = Real(rhs_value, b);
+ llvm::Value* rhs_imag = Imag(rhs_value, b);
+ llvm::Value* real_result1 = b->CreateFMul(lhs_real, rhs_real);
+ llvm::Value* real_result2 = b->CreateFMul(lhs_imag, rhs_imag);
+ llvm::Value* real_result = b->CreateFSub(real_result1, real_result2);
+ llvm::Value* imag_result1 = b->CreateFMul(lhs_real, rhs_imag);
+ llvm::Value* imag_result2 = b->CreateFMul(lhs_imag, rhs_real);
+ llvm::Value* imag_result = b->CreateFAdd(imag_result1, imag_result2);
return {real_result, imag_result};
}
} // namespace
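Editor's note: MultiplyComplex computes the standard complex product; with lhs = a + bi and rhs = c + di, the real and imaginary results above are

  (a + bi)(c + di) = (ac - bd) + (ad + bc)i

which matches real_result = lhs_real*rhs_real - lhs_imag*rhs_imag and imag_result = lhs_real*rhs_imag + lhs_imag*rhs_real.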
@@ -486,25 +452,24 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
const Shape& rhs_shape = rhs_instruction->shape();
// TODO(b/110211620): Convert to use i32 index_type when it is possible.
- llvm::Type* index_type = ir_builder_.getInt64Ty();
+ llvm::Type* index_type = b_.getInt64Ty();
llvm_ir::IrArray::Index element_index(index_type);
if (ShapeUtil::IsScalar(lhs_shape) && ShapeUtil::IsScalar(rhs_shape)) {
// If the operands are scalar, don't emit any loops.
llvm::Value* lhs_value =
- lhs_array.EmitReadArrayElement(/*index=*/element_index, &ir_builder_);
+ lhs_array.EmitReadArrayElement(/*index=*/element_index, &b_);
llvm::Value* rhs_value =
- rhs_array.EmitReadArrayElement(/*index=*/element_index, &ir_builder_);
+ rhs_array.EmitReadArrayElement(/*index=*/element_index, &b_);
llvm::Value* result;
if (ShapeUtil::ElementIsComplex(lhs_shape)) {
- auto value = MultiplyComplex(lhs_value, rhs_value, &ir_builder_);
+ auto value = MultiplyComplex(lhs_value, rhs_value, &b_);
result = llvm::ConstantAggregateZero::get(lhs_array.GetElementLlvmType());
- result = ir_builder_.CreateInsertValue(result, value.first, {0});
- result = ir_builder_.CreateInsertValue(result, value.second, {1});
+ result = b_.CreateInsertValue(result, value.first, {0});
+ result = b_.CreateInsertValue(result, value.second, {1});
} else {
- result = ir_builder_.CreateFMul(lhs_value, rhs_value);
+ result = b_.CreateFMul(lhs_value, rhs_value);
}
- target_array.EmitWriteArrayElement(/*index=*/element_index, result,
- &ir_builder_);
+ target_array.EmitWriteArrayElement(/*index=*/element_index, result, &b_);
return Status::OK();
}
@@ -531,7 +496,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
// Create loop nests which loop through the LHS operand dimensions and the RHS
// operand dimensions. The reduction dimension of the LHS and RHS are handled
// in a separate innermost loop which performs the sum of products.
- llvm_ir::ForLoopNest loop_nest(IrName(dot), &ir_builder_);
+ llvm_ir::ForLoopNest loop_nest(IrName(dot), &b_);
llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest(
lhs_array, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest(
@@ -555,7 +520,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
llvm::Value* accum_address = llvm_ir::EmitAllocaAtFunctionEntry(
accum_type, // The pointee type of the alloca instruction.
"accum_address", // The name of the alloca instruction.
- &ir_builder_);
+ &b_);
// Initialize the accumulator in the preheader to zero.
new llvm::StoreInst(
@@ -569,27 +534,25 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
// updated_accum = accum + lhs_element * rhs_element
// *accum_address = updated_accum
TF_RET_CHECK(!reduction_loop->GetBodyBasicBlock()->empty());
- ir_builder_.SetInsertPoint(
+ b_.SetInsertPoint(
&*reduction_loop->GetBodyBasicBlock()->getFirstInsertionPt());
- llvm::Value* lhs_element =
- lhs_array.EmitReadArrayElement(lhs_index, &ir_builder_);
- llvm::Value* rhs_element =
- rhs_array.EmitReadArrayElement(rhs_index, &ir_builder_);
- llvm::Value* accum = ir_builder_.CreateLoad(accum_address);
+ llvm::Value* lhs_element = lhs_array.EmitReadArrayElement(lhs_index, &b_);
+ llvm::Value* rhs_element = rhs_array.EmitReadArrayElement(rhs_index, &b_);
+ llvm::Value* accum = b_.CreateLoad(accum_address);
llvm::Value* updated_accum;
if (ShapeUtil::ElementIsComplex(lhs_shape)) {
- auto value = MultiplyComplex(lhs_element, rhs_element, &ir_builder_);
- llvm::Value* accum_real = Real(accum, &ir_builder_);
- llvm::Value* real_sum = ir_builder_.CreateFAdd(accum_real, value.first);
- updated_accum = ir_builder_.CreateInsertValue(accum, real_sum, {0});
- llvm::Value* accum_imag = Imag(accum, &ir_builder_);
- llvm::Value* imag_sum = ir_builder_.CreateFAdd(accum_imag, value.second);
- updated_accum = ir_builder_.CreateInsertValue(updated_accum, imag_sum, {1});
+ auto value = MultiplyComplex(lhs_element, rhs_element, &b_);
+ llvm::Value* accum_real = Real(accum, &b_);
+ llvm::Value* real_sum = b_.CreateFAdd(accum_real, value.first);
+ updated_accum = b_.CreateInsertValue(accum, real_sum, {0});
+ llvm::Value* accum_imag = Imag(accum, &b_);
+ llvm::Value* imag_sum = b_.CreateFAdd(accum_imag, value.second);
+ updated_accum = b_.CreateInsertValue(updated_accum, imag_sum, {1});
} else {
- llvm::Value* product = ir_builder_.CreateFMul(lhs_element, rhs_element);
- updated_accum = ir_builder_.CreateFAdd(accum, product);
+ llvm::Value* product = b_.CreateFMul(lhs_element, rhs_element);
+ updated_accum = b_.CreateFAdd(accum, product);
}
- ir_builder_.CreateStore(updated_accum, accum_address);
+ b_.CreateStore(updated_accum, accum_address);
// After the reduction loop exits, store the accumulator into the target
// address. The index into the target address is the concatenation of the rhs
@@ -606,16 +569,15 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
target_index.push_back(rhs_index[dimension]);
}
}
- SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), &b_);
target_array.EmitWriteArrayElement(
target_index,
- ir_builder_.CreateLoad(
- accum_address), // The value written to the target array.
- &ir_builder_);
+ b_.CreateLoad(accum_address), // The value written to the target array.
+ &b_);
// Set the IR builder insert point to the exit basic block of the outer most
// loop. This ensures later instructions are inserted after this loop nest.
- ir_builder_.SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
+ b_.SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
return Status::OK();
}
@@ -657,11 +619,10 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
[=](const llvm_ir::IrArray::Index& index) -> StatusOr<llvm::Value*> {
// Initialize an accumulator with init_value.
llvm::AllocaInst* accumulator_addr =
- ir_builder_.CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
+ b_.CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
reduce->shape().element_type(), module_));
- ir_builder_.CreateStore(
- ir_builder_.CreateLoad(GetBasePointer(*init_value)),
- accumulator_addr);
+ b_.CreateStore(b_.CreateLoad(GetBasePointer(*init_value)),
+ accumulator_addr);
// The enclosing loops go over all the target elements. Now we have to
// compute the actual target element. For this, we build a new loop nest
@@ -669,12 +630,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
// AddLoopsForShapeOnDimensions will return an Index where induction
// Value*s are placed for each dimension in dimensions, and all the rest
// are nullptrs.
- llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &ir_builder_);
+ llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_);
const llvm_ir::IrArray::Index reduced_dims_index =
loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
"reduction_dim");
- SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+ SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
// Build a full index for the input argument, using reduced_dims_index
// as the base. In reduced_dims_index only the reduction dimensions are
@@ -693,13 +654,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
// Apply the reduction function to the loaded value.
llvm::Value* input_address =
- GetIrArray(*arg, *reduce)
- .EmitArrayElementAddress(input_index, &ir_builder_);
+ GetIrArray(*arg, *reduce).EmitArrayElementAddress(input_index, &b_);
TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
*function, {accumulator_addr, input_address}, accumulator_addr));
- SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
- return ir_builder_.CreateLoad(accumulator_addr);
+ SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
+ return b_.CreateLoad(accumulator_addr);
});
}
@@ -712,8 +672,8 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
for (HloInstruction* operand : fusion->operands()) {
parameter_arrays.push_back(GetIrArray(*operand, *fusion));
}
- GpuElementalIrEmitter elemental_emitter(hlo_module_config_, module_,
- &ir_builder_, GetNestedComputer());
+ GpuElementalIrEmitter elemental_emitter(hlo_module_config_, module_, &b_,
+ GetNestedComputer());
FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter);
TF_RETURN_IF_ERROR(fusion->fused_expression_root()->Accept(&fused_emitter));
@@ -743,24 +703,6 @@ Status IrEmitter::HandleOutfeed(HloInstruction*) {
return Unimplemented("Outfeed is not supported on GPU.");
}
-Status IrEmitter::HandleRng(HloInstruction* random) {
- ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
- for (const HloInstruction* operand : random->operands()) {
- operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
- return GetIrArray(*operand, *random)
- .EmitReadArrayElement(index, &ir_builder_);
- };
- }
- // Emits a single-threaded loop because the loop body generated by the element
- // generator for Rng can't be parallelized (b/32333178).
- return llvm_ir::LoopEmitter(
- GpuElementalIrEmitter(hlo_module_config_, module_, &ir_builder_,
- GetNestedComputer())
- .MakeElementGenerator(random, operand_to_generator),
- GetIrArray(*random, *random), &ir_builder_)
- .EmitLoop(IrName(random));
-}
-
Status IrEmitter::HandleBatchNormInference(HloInstruction*) {
return Unimplemented(
"The GPU backend does not implement BatchNormInference directly. It "
@@ -784,22 +726,27 @@ Status IrEmitter::HandleBatchNormGrad(HloInstruction*) {
"to a cudnn CustomCall using CudnnBatchNormRewriter.");
}
+Status IrEmitter::HandleIota(HloInstruction*) {
+ // TODO(b/64798317): implement iota on GPU.
+ return Unimplemented("Iota is not implemented on GPU.");
+}
+
StatusOr<llvm::Value*> IrEmitter::ComputeNestedElement(
const HloComputation& computation,
tensorflow::gtl::ArraySlice<llvm::Value*> parameter_elements) {
llvm::Value* return_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
llvm_ir::PrimitiveTypeToIrType(
computation.root_instruction()->shape().element_type(), module_),
- "return_buffer", &ir_builder_);
+ "return_buffer", &b_);
std::vector<llvm::Value*> parameter_buffers;
for (llvm::Value* parameter_element : parameter_elements) {
parameter_buffers.push_back(llvm_ir::EmitAllocaAtFunctionEntry(
- parameter_element->getType(), "parameter_buffer", &ir_builder_));
- ir_builder_.CreateStore(parameter_element, parameter_buffers.back());
+ parameter_element->getType(), "parameter_buffer", &b_));
+ b_.CreateStore(parameter_element, parameter_buffers.back());
}
TF_RETURN_IF_ERROR(EmitCallToNestedComputation(computation, parameter_buffers,
return_buffer));
- return ir_builder_.CreateLoad(return_buffer);
+ return b_.CreateLoad(return_buffer);
}
} // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 77e48d729c..80e2a203ac 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -79,7 +79,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
Status HandleCrossReplicaSum(HloInstruction* crs) override;
Status HandleInfeed(HloInstruction* infeed) override;
Status HandleOutfeed(HloInstruction* outfeed) override;
- Status HandleSort(HloInstruction* sort) override;
Status HandleSend(HloInstruction* send) override;
Status HandleSendDone(HloInstruction* send_done) override;
Status HandleRecv(HloInstruction* recv) override;
@@ -92,10 +91,10 @@ class IrEmitter : public DfsHloVisitorWithDefault {
Status HandleFusion(HloInstruction* fusion) override;
Status HandleCall(HloInstruction* call) override;
Status HandleCustomCall(HloInstruction* custom_call) override;
- Status HandleRng(HloInstruction* random) override;
Status HandleBatchNormInference(HloInstruction* batch_norm) override;
Status HandleBatchNormTraining(HloInstruction* batch_norm) override;
Status HandleBatchNormGrad(HloInstruction* batch_norm) override;
+ Status HandleIota(HloInstruction* iota) override;
Status FinishVisit(HloInstruction* root) override { return Status::OK(); }
@@ -162,7 +161,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
// The following fields track the IR emission state. According to LLVM memory
// management rules, their memory is owned by the module.
- llvm::IRBuilder<> ir_builder_;
+ llvm::IRBuilder<> b_;
// Mapping from HLO to its underlying LLVM value.
HloToIrBindings bindings_;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
index c9574c87a3..5c827e5f9c 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
@@ -70,10 +70,10 @@ llvm::Function* IrEmitterNested::EmitBasePointersForNestedComputation(
argument_dereferenceable_bytes.push_back(root_size);
}
// The base pointer of the memory block for all pre-allocated temp buffers.
- argument_types.push_back(ir_builder_.getInt8PtrTy());
+ argument_types.push_back(b_.getInt8PtrTy());
llvm::FunctionType* function_type =
- llvm::FunctionType::get(ir_builder_.getVoidTy(), argument_types, false);
+ llvm::FunctionType::get(b_.getVoidTy(), argument_types, false);
llvm::Function* function = llvm::Function::Create(
function_type, // The function type.
llvm::GlobalValue::InternalLinkage, // The linkage type.
@@ -96,8 +96,7 @@ llvm::Function* IrEmitterNested::EmitBasePointersForNestedComputation(
llvm::BasicBlock::Create(function->getContext(), "entry", function);
// Emit a "return void" at entry_bb's end, and sets the insert point before
// that return instruction.
- ir_builder_.SetInsertPoint(
- llvm::ReturnInst::Create(function->getContext(), entry_bb));
+ b_.SetInsertPoint(llvm::ReturnInst::Create(function->getContext(), entry_bb));
std::vector<const HloInstruction*> non_io_hlos;
for (const auto* hlo : nested_computation.instructions()) {
@@ -127,20 +126,17 @@ Status IrEmitterNested::EmitTargetElementLoop(
target_arrays.push_back(GetIrArray(hlo, hlo, {i}));
}
TF_RETURN_IF_ERROR(
- llvm_ir::LoopEmitter(element_generator, target_arrays, &ir_builder_)
- .EmitLoop());
+ llvm_ir::LoopEmitter(element_generator, target_arrays, &b_).EmitLoop());
std::vector<llvm::Value*> tuple_operand_ptrs;
tuple_operand_ptrs.reserve(num_elems);
for (const llvm_ir::IrArray& array : target_arrays) {
tuple_operand_ptrs.push_back(array.GetBasePointer());
}
- llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &ir_builder_,
- module_);
+ llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &b_, module_);
return Status::OK();
}
- return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo),
- &ir_builder_)
+ return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo), &b_)
.EmitLoop();
}
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 1caf10a6c1..3a5394dac6 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -33,6 +33,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
#include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
@@ -59,10 +60,12 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
#include "tensorflow/compiler/xla/service/name_uniquer.h"
#include "tensorflow/compiler/xla/shape_util.h"
@@ -71,8 +74,10 @@ limitations under the License.
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/window_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
#include "tensorflow/core/platform/logging.h"
namespace xla {
@@ -213,7 +218,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
llvm::LLVMContext& context = module->getContext();
llvm::FunctionType* kernel_type = llvm::FunctionType::get(
/*Result=*/llvm::Type::getVoidTy(context),
- std::vector<llvm::Type*>(args.size(), ir_builder_.getInt8PtrTy()),
+ std::vector<llvm::Type*>(args.size(), b_.getInt8PtrTy()),
/*isVarArg=*/false);
llvm::Function* kernel =
llvm::Function::Create(kernel_type, llvm::GlobalValue::ExternalLinkage,
@@ -228,9 +233,20 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
++arg_it;
kernel->addDereferenceableAttr(arg_no + 1, alloc->size());
+
+ const int64 alignment = [&] {
+ if (alloc->is_entry_computation_parameter()) {
+ return kEntryParameterAlignBytes;
+ } else if (alloc->is_constant()) {
+ return kConstantBufferAlignBytes;
+ } else {
+ return kXlaAllocatedBufferAlignBytes;
+ }
+ }();
+
kernel->addParamAttr(
- arg_no, llvm::Attribute::get(context, llvm::Attribute::Alignment,
- kCudaMallocAlignBytes));
+ arg_no,
+ llvm::Attribute::get(context, llvm::Attribute::Alignment, alignment));
if (alloc->IsPreallocatedTempBuffer()) {
fn_arg->setName("temp_buf");
@@ -249,7 +265,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
nvvm_annotations_node->addOperand(llvm::MDNode::get(
context, {llvm::ConstantAsMetadata::get(kernel),
llvm::MDString::get(context, "kernel"),
- llvm::ConstantAsMetadata::get(ir_builder_.getInt32(1))}));
+ llvm::ConstantAsMetadata::get(b_.getInt32(1))}));
// Update the insert point to the entry basic block.
llvm::BasicBlock* entry_bb =
@@ -257,7 +273,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
// Emit a "return void" at entry_bb's end, and set the insert point before
// that return instruction.
- ir_builder_.SetInsertPoint(llvm::ReturnInst::Create(context, entry_bb));
+ b_.SetInsertPoint(llvm::ReturnInst::Create(context, entry_bb));
return kernel;
}
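Editor's note: the immediately-invoked lambda above picks one of three alignment constants by allocation kind. A minimal sketch of that selection; the byte values here are illustrative placeholders only (the real constants are defined elsewhere in this patch and are not shown in this excerpt):

#include <cstdint>

// Illustrative placeholder values; the real constants live in the XLA GPU
// backend and may differ.
constexpr int64_t kEntryParameterAlignBytes = 16;
constexpr int64_t kXlaAllocatedBufferAlignBytes = 128;
constexpr int64_t kConstantBufferAlignBytes = 256;

int64_t AlignmentFor(bool is_entry_parameter, bool is_constant) {
  if (is_entry_parameter) return kEntryParameterAlignBytes;
  if (is_constant) return kConstantBufferAlignBytes;
  return kXlaAllocatedBufferAlignBytes;  // XLA's own temp allocations
}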
@@ -295,7 +311,7 @@ int ComputeMaxUnrollFactor(const HloInstruction* hlo) {
// range of i32.
// Otherwise, the return type is i64.
llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
// Find the unnested hlo instruction for which the kernel is generated.
const HloInstruction* unnested_hlo = hlo;
const HloComputation* computation = hlo->parent();
@@ -316,7 +332,7 @@ llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
return in_range;
};
- llvm::Type* i64_ty = ir_builder->getInt64Ty();
+ llvm::Type* i64_ty = b->getInt64Ty();
// Check launch dimension
if (!IsInt32(launch_size)) {
return i64_ty;
@@ -345,7 +361,7 @@ llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
}
}
- return ir_builder->getInt32Ty();
+ return b->getInt32Ty();
}
} // namespace
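Editor's note: a minimal sketch of the width decision GetIndexTypeForKernel makes, in plain C++; the real code also walks the unnested hlo's output and operand shapes via the in_range check above:

#include <cstdint>
#include <limits>

bool FitsInInt32(int64_t v) {
  return v >= std::numeric_limits<int32_t>::min() &&
         v <= std::numeric_limits<int32_t>::max();
}

// Returns 32 only if the launch bound and every tensor extent fit in i32;
// otherwise falls back to 64-bit indexing.
int ChooseIndexBits(int64_t launch_size, int64_t max_num_elements) {
  return FitsInInt32(launch_size) && FitsInInt32(max_num_elements) ? 32 : 64;
}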
@@ -600,8 +616,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
parameter_arrays.push_back(GetIrArray(*operand, *fusion));
}
GpuElementalIrEmitter elemental_emitter(
- hlo_module_config_, ir_emitter_context_->llvm_module(),
- &ir_builder_, GetNestedComputer());
+ hlo_module_config_, ir_emitter_context_->llvm_module(), &b_,
+ GetNestedComputer());
FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter);
TF_RETURN_IF_ERROR(root->Accept(&fused_emitter));
@@ -674,7 +690,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
}
GpuElementalIrEmitter elemental_emitter(hlo_module_config_,
ir_emitter_context_->llvm_module(),
- &ir_builder_, GetNestedComputer());
+ &b_, GetNestedComputer());
// Shape of the dynamic-update-slice's "update" operand.
Shape update_shape = root->operand(1)->shape();
@@ -692,7 +708,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
return llvm_ir::EmitParallelFusedDynamicUpdateSliceInPlace(
fusion, operand_arrays, output_array, &elemental_emitter,
- launch_dimensions, &ir_builder_);
+ launch_dimensions, &b_);
}
if (ImplementedAsGemm(*fusion)) {
@@ -740,11 +756,11 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce(
const HloInstruction* output = reduce->parent()->FusionInstruction();
llvm::Value* extra_output_address =
GetIrArray(*output, *output, extra_output_gens[i].second)
- .EmitArrayElementAddress(index, &ir_builder_,
+ .EmitArrayElementAddress(index, &b_,
"extra_output_element_address");
TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value,
extra_output_gens[i].first(index));
- ir_builder_.CreateStore(extra_output_ir_value, extra_output_address);
+ b_.CreateStore(extra_output_ir_value, extra_output_address);
}
return Status::OK();
}
@@ -774,8 +790,8 @@ Status IrEmitterUnnested::EmitReductionToScalar(
LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
tiled_input_shape, ir_emitter_context_->device_description());
- llvm::Type* index_ty = GetIndexTypeForKernel(
- reduce, launch_dimensions.launch_bound(), &ir_builder_);
+ llvm::Type* index_ty =
+ GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_);
auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
return llvm::ConstantInt::get(index_ty, c);
@@ -825,52 +841,51 @@ Status IrEmitterUnnested::EmitReductionToScalar(
llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_);
std::vector<llvm::Value*> partial_reduction_result_addresses;
for (int i = 0; i != num_reduces; ++i) {
- llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca(
- element_ir_type, /*ArraySize=*/nullptr,
- "partial_reduction_result." + llvm::Twine(i));
+ llvm::Value* partial_reduction_result_address =
+ b_.CreateAlloca(element_ir_type, /*ArraySize=*/nullptr,
+ "partial_reduction_result." + llvm::Twine(i));
TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value,
init_value_gens[i](IrArray::Index(index_ty)));
- ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address);
+ b_.CreateStore(init_ir_value, partial_reduction_result_address);
partial_reduction_result_addresses.push_back(
partial_reduction_result_address);
}
llvm::Value* x_in_tiles = tile_index[0];
- x_in_tiles = ir_builder_.CreateZExtOrTrunc(x_in_tiles, index_ty);
+ x_in_tiles = b_.CreateZExtOrTrunc(x_in_tiles, index_ty);
// Emit an inner for-loop that reduces the elements in the tile.
auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status {
std::unique_ptr<llvm_ir::ForLoop> tile_element_loop =
- llvm_ir::ForLoop::EmitForLoop("element_id_in_tile",
- index_typed_constant(0),
- index_typed_constant(kTileSize),
- index_typed_constant(1), &ir_builder_);
+ llvm_ir::ForLoop::EmitForLoop(
+ "element_id_in_tile", index_typed_constant(0),
+ index_typed_constant(kTileSize), index_typed_constant(1), &b_);
// Emit the body of the partial reduction loop.
llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(),
- &ir_builder_);
- llvm::Value* x = ir_builder_.CreateNSWAdd(
- ir_builder_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize)),
+ &b_);
+ llvm::Value* x = b_.CreateNSWAdd(
+ b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize)),
tile_element_loop->GetIndVarValue());
// Unless we know the tile is entirely in bounds, we have to emit an
// x-in-bounds check before reading from the input.
if (!tile_in_bounds) {
llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
- ir_builder_.CreateICmpULT(x, index_typed_constant(num_elems)),
- "x_in_bounds", &ir_builder_);
+ b_.CreateICmpULT(x, index_typed_constant(num_elems)), "x_in_bounds",
+ &b_);
// Emit code that reads the input element and accumulates it to
// the partial reduction result.
- llvm_ir::SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_);
}
IrArray::Index input_index(
- /*linear=*/x, input_shape, &ir_builder_);
- llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type);
+ /*linear=*/x, input_shape, &b_);
+ llvm::Value* input_address = b_.CreateAlloca(element_ir_type);
for (int i = 0; i != num_reduces; ++i) {
TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
input_gens[i](input_index));
- ir_builder_.CreateStore(input_ir_value, input_address);
+ b_.CreateStore(input_ir_value, input_address);
TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
*reducers[i],
{partial_reduction_result_addresses[i], input_address},
@@ -881,52 +896,48 @@ Status IrEmitterUnnested::EmitReductionToScalar(
// x_end = kTileSize + x_in_tiles * kTileSize, i.e., the location that's
// immediately beyond the tile.
- llvm::Value* x_end = ir_builder_.CreateNSWAdd(
+ llvm::Value* x_end = b_.CreateNSWAdd(
index_typed_constant(kTileSize),
- ir_builder_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize)));
+ b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize)));
// The tile is entirely in bounds if all_threads_in_bounds or
// x_end <= num_elems.
- llvm::Value* tile_in_bounds = ir_builder_.CreateOr(
- ir_builder_.CreateICmpULE(x_end, index_typed_constant(num_elems)),
- ir_builder_.getInt1(all_threads_in_bounds));
+ llvm::Value* tile_in_bounds =
+ b_.CreateOr(b_.CreateICmpULE(x_end, index_typed_constant(num_elems)),
+ b_.getInt1(all_threads_in_bounds));
llvm_ir::LlvmIfData if_tile_in_bounds_data =
- llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &ir_builder_);
- llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block,
- &ir_builder_);
+ llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &b_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block, &b_);
TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/true));
- llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block,
- &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block, &b_);
TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/false));
// After the if-then-else statement on tile_in_bounds, emit calls to
// shfl_down that accumulate the partial reduction results of all threads
// from the warp.
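// In CUDA terms the emitted sequence is roughly (a sketch, not the exact
// intrinsic calls; Reduce stands in for the nested reducer computation):
//
//   for (int d = kWarpSize / 2; d >= 1; d /= 2)
//     partial = Reduce(partial, __shfl_down(partial, d));
//
// after which lane 0 of each warp holds the reduction over the whole warp.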
- llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block,
- &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block, &b_);
int bit_width = llvm_ir::GetSizeInBits(element_ir_type);
// bitcast cannot be applied to aggregate types (even packed ones), so we
// instead bitcast addresses of load/store to intN* of the same bit-width.
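// For example, a complex64 value ({float, float} in IR, bit_width == 64)
// moves through the shuffle as a plain integer, conceptually:
//
//   int64 bits = *reinterpret_cast<int64*>(value_address);  // sketch only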
llvm::Type* shuffle_ir_type = element_ir_type->isStructTy()
- ? ir_builder_.getIntNTy(bit_width)
+ ? b_.getIntNTy(bit_width)
: element_ir_type;
for (int shuffle_distance = kWarpSize / 2; shuffle_distance >= 1;
shuffle_distance /= 2) {
- llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca(
- element_ir_type, nullptr, "result_from_other_lane");
+ llvm::Value* result_from_other_lane =
+ b_.CreateAlloca(element_ir_type, nullptr, "result_from_other_lane");
for (int i = 0; i != num_reduces; ++i) {
- llvm::Value* partial_reduction_result = ir_builder_.CreateLoad(
- ir_builder_.CreateBitCast(partial_reduction_result_addresses[i],
- shuffle_ir_type->getPointerTo()),
+ llvm::Value* partial_reduction_result = b_.CreateLoad(
+ b_.CreateBitCast(partial_reduction_result_addresses[i],
+ shuffle_ir_type->getPointerTo()),
"partial_reduction_result");
CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0)
<< "Requires block size a multiple of the warp size, otherwise we "
"will read undefined elements.";
- ir_builder_.CreateStore(
+ b_.CreateStore(
EmitFullWarpShuffleDown(partial_reduction_result,
- ir_builder_.getInt32(shuffle_distance),
- &ir_builder_),
- ir_builder_.CreateBitCast(result_from_other_lane,
- shuffle_ir_type->getPointerTo()));
+ b_.getInt32(shuffle_distance), &b_),
+ b_.CreateBitCast(result_from_other_lane,
+ shuffle_ir_type->getPointerTo()));
TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
*reducers[i],
{partial_reduction_result_addresses[i], result_from_other_lane},
@@ -940,24 +951,23 @@ Status IrEmitterUnnested::EmitReductionToScalar(
// Emit an atomic operation that accumulates the partial reduction result of
// lane 0 (which holds the partially accumulated result for its warp) to the
// output element.
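// In pseudocode (a sketch of the emitted control flow, where AtomicReduce
// stands in for the nested reducer computation below):
//
//   if (lane_id == 0) AtomicReduce(&output[...], partial_result[i]);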
- llvm::Value* lane_id = ir_builder_.CreateURem(
- x_in_tiles, index_typed_constant(kWarpSize), "lane_id");
+ llvm::Value* lane_id =
+ b_.CreateURem(x_in_tiles, index_typed_constant(kWarpSize), "lane_id");
llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse(
- ir_builder_.CreateICmpEQ(lane_id, index_typed_constant(0)),
- "lane_id_is_zero", &ir_builder_);
- llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block,
- &ir_builder_);
+ b_.CreateICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero",
+ &b_);
+ llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_);
for (int i = 0; i != num_reduces; ++i) {
llvm::Value* output_address =
GetIrArray(*output, *output, reduce_output_shapes[i])
.EmitArrayElementAddress(
IrArray::Index(
- /*linear=*/ir_builder_.getInt64(0),
+ /*linear=*/b_.getInt64(0),
ShapeUtil::GetSubshape(output->shape(),
reduce_output_shapes[i]),
- &ir_builder_),
- &ir_builder_, "output_element_address");
+ &b_),
+ &b_, "output_element_address");
TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
*reducers[i], output_address, partial_reduction_result_addresses[i]));
}
@@ -971,7 +981,7 @@ Status IrEmitterUnnested::EmitReductionToScalar(
static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
ir_emitter_context_->llvm_module());
return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
- launch_dimensions, &ir_builder_)
+ launch_dimensions, &b_)
.EmitLoop(IrName(reduce), index_ty);
}
@@ -1015,7 +1025,7 @@ Status IrEmitterUnnested::EmitColumnReduction(
tiled_input_shape, ir_emitter_context_->device_description());
// TODO(b/110211620): Convert to use i32 index_type when it is possible.
- llvm::Type* index_ty = ir_builder_.getInt64Ty();
+ llvm::Type* index_ty = b_.getInt64Ty();
auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
return llvm::ConstantInt::get(index_ty, c);
@@ -1065,14 +1075,12 @@ Status IrEmitterUnnested::EmitColumnReduction(
for (int i = 0; i != num_reduces; ++i) {
for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) {
llvm::Value* partial_reduction_result_address =
- ir_builder_.CreateAlloca(
- element_ir_type, /*ArraySize=*/nullptr,
- "partial_reduction_result." +
- llvm::Twine(i * kTileWidth + x_offset));
+ b_.CreateAlloca(element_ir_type, /*ArraySize=*/nullptr,
+ "partial_reduction_result." +
+ llvm::Twine(i * kTileWidth + x_offset));
TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value,
init_value_gens[i](IrArray::Index(index_ty)));
- ir_builder_.CreateStore(init_ir_value,
- partial_reduction_result_address);
+ b_.CreateStore(init_ir_value, partial_reduction_result_address);
partial_reduction_result_addresses.push_back(
partial_reduction_result_address);
}
@@ -1083,50 +1091,47 @@ Status IrEmitterUnnested::EmitColumnReduction(
llvm::Value* y_in_tiles = tile_index[0];
llvm::Value* x_in_tiles = tile_index[1];
- y_in_tiles = ir_builder_.CreateZExtOrTrunc(y_in_tiles, index_ty);
- x_in_tiles = ir_builder_.CreateZExtOrTrunc(x_in_tiles, index_ty);
+ y_in_tiles = b_.CreateZExtOrTrunc(y_in_tiles, index_ty);
+ x_in_tiles = b_.CreateZExtOrTrunc(x_in_tiles, index_ty);
auto emit_tile_element_loop = [=](bool tile_in_y_bounds,
bool tile_in_x_bounds) -> Status {
std::unique_ptr<llvm_ir::ForLoop> tile_element_loop =
- llvm_ir::ForLoop::EmitForLoop("element_id_in_tile",
- index_typed_constant(0),
- index_typed_constant(kTileHeight),
- index_typed_constant(1), &ir_builder_);
+ llvm_ir::ForLoop::EmitForLoop(
+ "element_id_in_tile", index_typed_constant(0),
+ index_typed_constant(kTileHeight), index_typed_constant(1), &b_);
// Emit the body of the partial reduction loop.
llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(),
- &ir_builder_);
- llvm::Value* y = ir_builder_.CreateNSWAdd(
- ir_builder_.CreateNSWMul(y_in_tiles,
- index_typed_constant(kTileHeight)),
+ &b_);
+ llvm::Value* y = b_.CreateNSWAdd(
+ b_.CreateNSWMul(y_in_tiles, index_typed_constant(kTileHeight)),
tile_element_loop->GetIndVarValue());
// Unless we know that y is in bounds, we have to emit a check before
// reading from the input.
if (!tile_in_y_bounds) {
llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
- ir_builder_.CreateICmpULT(y, index_typed_constant(height)),
- "y_in_bounds", &ir_builder_);
+ b_.CreateICmpULT(y, index_typed_constant(height)), "y_in_bounds",
+ &b_);
// Emit code that reads the input element and accumulates it to
// the partial reduction result.
- llvm_ir::SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_);
}
for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) {
- llvm::Value* x = ir_builder_.CreateNSWAdd(
- ir_builder_.CreateNSWMul(x_in_tiles,
- index_typed_constant(kTileWidth)),
+ llvm::Value* x = b_.CreateNSWAdd(
+ b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)),
index_typed_constant(x_offset));
// Unless we know that x is in bounds, we have to emit a check before
// reading from the input.
if (!tile_in_x_bounds) {
llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
- ir_builder_.CreateICmpULT(x, index_typed_constant(width)),
- "x_in_bounds", &ir_builder_);
- llvm_ir::SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+ b_.CreateICmpULT(x, index_typed_constant(width)), "x_in_bounds",
+ &b_);
+ llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_);
}
- llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type);
+ llvm::Value* input_address = b_.CreateAlloca(element_ir_type);
// {y,x} is an index to input_matrix_shape [height,width]. We need to
// convert that to an index to input_shape (the shape of the operand of
// "reduce"). This conversion is composed of a transposition from
@@ -1143,18 +1148,17 @@ Status IrEmitterUnnested::EmitColumnReduction(
ShapeUtil::MakeShapeWithDescendingLayout(input_shape.element_type(),
{height, width});
const IrArray::Index input_matrix_index({y, x}, input_matrix_shape,
- &ir_builder_);
+ &b_);
const IrArray::Index input_index =
input_matrix_index
.SourceIndexOfReshape(input_matrix_shape,
- normalized_input_shape, &ir_builder_)
+ normalized_input_shape, &b_)
.SourceIndexOfTranspose(normalized_input_shape, input_shape,
- transpose_dimension_mapping,
- &ir_builder_);
+ transpose_dimension_mapping, &b_);
for (int i = 0; i != num_reduces; ++i) {
TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
input_gens[i](input_index));
- ir_builder_.CreateStore(input_ir_value, input_address);
+ b_.CreateStore(input_ir_value, input_address);
TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
*reducers[i],
{partial_reduction_result_addresses[i * kTileWidth + x_offset],
@@ -1169,64 +1173,55 @@ Status IrEmitterUnnested::EmitColumnReduction(
// y_end = kTileHeight + y_in_tiles * kTileHeight, i.e., the y location
// that's immediately beyond the tile.
- llvm::Value* y_end = ir_builder_.CreateNSWAdd(
+ llvm::Value* y_end = b_.CreateNSWAdd(
index_typed_constant(kTileHeight),
- ir_builder_.CreateNSWMul(y_in_tiles,
- index_typed_constant(kTileHeight)));
+ b_.CreateNSWMul(y_in_tiles, index_typed_constant(kTileHeight)));
// x_end = kTileWidth + x_in_tiles * kTileWidth, i.e., the x location
// that's immediately beyond the tile.
- llvm::Value* x_end = ir_builder_.CreateNSWAdd(
+ llvm::Value* x_end = b_.CreateNSWAdd(
index_typed_constant(kTileWidth),
- ir_builder_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)));
- llvm::Value* tile_in_y_bounds = ir_builder_.CreateOr(
- ir_builder_.CreateICmpULE(y_end, index_typed_constant(height)),
- ir_builder_.getInt1(height % kTileHeight == 0));
- llvm::Value* tile_in_x_bounds = ir_builder_.CreateOr(
- ir_builder_.CreateICmpULE(x_end, index_typed_constant(width)),
- ir_builder_.getInt1(width % kTileWidth == 0));
+ b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)));
+ llvm::Value* tile_in_y_bounds =
+ b_.CreateOr(b_.CreateICmpULE(y_end, index_typed_constant(height)),
+ b_.getInt1(height % kTileHeight == 0));
+ llvm::Value* tile_in_x_bounds =
+ b_.CreateOr(b_.CreateICmpULE(x_end, index_typed_constant(width)),
+ b_.getInt1(width % kTileWidth == 0));
// The tile is in y bounds if "height" is a multiple of kTileHeight or
// y_end <= height.
- llvm_ir::LlvmIfData if_tile_in_y_bounds_data = llvm_ir::EmitIfThenElse(
- tile_in_y_bounds, "tile_in_y_bounds", &ir_builder_);
- llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.true_block,
- &ir_builder_);
+ llvm_ir::LlvmIfData if_tile_in_y_bounds_data =
+ llvm_ir::EmitIfThenElse(tile_in_y_bounds, "tile_in_y_bounds", &b_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.true_block, &b_);
// The tile is in x bounds if "width" is a multiple of kTileWidth or
// x_end <= width.
- llvm_ir::LlvmIfData if_tile_in_x_bounds_data = llvm_ir::EmitIfThenElse(
- tile_in_x_bounds, "tile_in_x_bounds", &ir_builder_);
- llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block,
- &ir_builder_);
+ llvm_ir::LlvmIfData if_tile_in_x_bounds_data =
+ llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_);
TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true,
/*tile_in_x_bounds=*/true));
- llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block,
- &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_);
TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true,
/*tile_in_x_bounds=*/false));
- llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.false_block,
- &ir_builder_);
- if_tile_in_x_bounds_data = llvm_ir::EmitIfThenElse(
- tile_in_x_bounds, "tile_in_x_bounds", &ir_builder_);
- llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block,
- &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.false_block, &b_);
+ if_tile_in_x_bounds_data =
+ llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_);
TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false,
/*tile_in_x_bounds=*/true));
- llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block,
- &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_);
TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false,
/*tile_in_x_bounds=*/false));
// After the nested if-then-else statement on tile_in_y_bounds and
// tile_in_x_bounds, emit atomic operations to accumulate the partial
// reduction result to the output element.
- llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.after_block,
- &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.after_block, &b_);
const HloInstruction* output =
reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce;
for (int i = 0; i != num_reduces; ++i) {
for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) {
- llvm::Value* x = ir_builder_.CreateNSWAdd(
- ir_builder_.CreateNSWMul(x_in_tiles,
- index_typed_constant(kTileWidth)),
+ llvm::Value* x = b_.CreateNSWAdd(
+ b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)),
index_typed_constant(x_offset));
llvm::Value* output_address =
GetIrArray(*output, *output, reduce_output_shapes[i])
@@ -1235,8 +1230,8 @@ Status IrEmitterUnnested::EmitColumnReduction(
x,
ShapeUtil::GetSubshape(output->shape(),
reduce_output_shapes[i]),
- &ir_builder_),
- &ir_builder_, "output_element_address");
+ &b_),
+ &b_, "output_element_address");
TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
*reducers[i], output_address,
partial_reduction_result_addresses[i * kTileWidth + x_offset]));
@@ -1252,7 +1247,7 @@ Status IrEmitterUnnested::EmitColumnReduction(
static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
ir_emitter_context_->llvm_module());
return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
- launch_dimensions, &ir_builder_)
+ launch_dimensions, &b_)
.EmitLoop(IrName(reduce), index_ty);
}
@@ -1402,8 +1397,8 @@ Status IrEmitterUnnested::EmitRowReduction(
{depth / z_tile_size, height, width_in_tiles}, {2, 1, 0});
LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
tiled_input_shape, ir_emitter_context_->device_description());
- llvm::Type* index_ty = GetIndexTypeForKernel(
- reduce, launch_dimensions.launch_bound(), &ir_builder_);
+ llvm::Type* index_ty =
+ GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_);
auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
return llvm::ConstantInt::get(index_ty, c);
@@ -1415,12 +1410,12 @@ Status IrEmitterUnnested::EmitRowReduction(
input_shape.element_type(), ir_emitter_context_->llvm_module());
std::vector<llvm::Value*> partial_reduction_result_addresses;
for (int i = 0; i != num_reduces; ++i) {
- llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca(
- element_ir_type, /*ArraySize=*/nullptr,
- "partial_reduction_result." + llvm::Twine(i));
+ llvm::Value* partial_reduction_result_address =
+ b_.CreateAlloca(element_ir_type, /*ArraySize=*/nullptr,
+ "partial_reduction_result." + llvm::Twine(i));
TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value,
init_value_gens[i](IrArray::Index(index_ty)));
- ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address);
+ b_.CreateStore(init_ir_value, partial_reduction_result_address);
partial_reduction_result_addresses.push_back(
partial_reduction_result_address);
}
@@ -1429,25 +1424,25 @@ Status IrEmitterUnnested::EmitRowReduction(
llvm::Value* y = tile_index[1];
llvm::Value* x_tile = tile_index[2];
- x_tile = ir_builder_.CreateZExtOrTrunc(x_tile, index_ty);
+ x_tile = b_.CreateZExtOrTrunc(x_tile, index_ty);
- llvm::Value* warp_id = ir_builder_.CreateUDiv(
- x_tile, index_typed_constant(kWarpSize), "warp_id");
- llvm::Value* lane_id = ir_builder_.CreateURem(
- x_tile, index_typed_constant(kWarpSize), "lane_id");
+ llvm::Value* warp_id =
+ b_.CreateUDiv(x_tile, index_typed_constant(kWarpSize), "warp_id");
+ llvm::Value* lane_id =
+ b_.CreateURem(x_tile, index_typed_constant(kWarpSize), "lane_id");
// The x-location of the last element in this z-x-tile.
// last_x = lane_id + warpSize * (x_tile_size - 1 + warp_id * x_tile_size);
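// For instance, with kWarpSize == 32, x_tile_size == 8, warp_id == 2 and
// lane_id == 5, last_x = 5 + 32 * (7 + 2 * 8) = 741.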
- llvm::Value* last_x = ir_builder_.CreateNSWAdd(
- lane_id, ir_builder_.CreateNSWMul(
- index_typed_constant(kWarpSize),
- ir_builder_.CreateNSWAdd(
- index_typed_constant(x_tile_size - 1),
- ir_builder_.CreateNSWMul(
- warp_id, index_typed_constant(x_tile_size)))));
+ llvm::Value* last_x = b_.CreateNSWAdd(
+ lane_id,
+ b_.CreateNSWMul(
+ index_typed_constant(kWarpSize),
+ b_.CreateNSWAdd(
+ index_typed_constant(x_tile_size - 1),
+ b_.CreateNSWMul(warp_id, index_typed_constant(x_tile_size)))));
KernelSupportLibrary ksl(
- &ir_builder_,
+ &b_,
/*unroll_mode=*/xla::llvm_ir::UnrollMode::kFullyUnroll,
/*prevent_vectorization=*/false);
@@ -1456,9 +1451,9 @@ Status IrEmitterUnnested::EmitRowReduction(
auto emit_z_x_tile_element_loop = [&](bool x_tile_in_bounds,
int64 x_tile_loop_bound) -> Status {
auto emit_z_tile_element_loop = [&](llvm::Value* z_indvar) -> Status {
- llvm::Value* z = ir_builder_.CreateNSWAdd(
- z_indvar, ir_builder_.CreateNSWMul(
- index_typed_constant(z_tile_size), z_tile));
+ llvm::Value* z = b_.CreateNSWAdd(
+ z_indvar,
+ b_.CreateNSWMul(index_typed_constant(z_tile_size), z_tile));
TF_RETURN_IF_ERROR(ksl.For(
"x_tile",
/*start=*/index_typed_constant(0),
@@ -1466,12 +1461,12 @@ Status IrEmitterUnnested::EmitRowReduction(
/*step=*/1, [&](llvm::Value* x_indvar) -> Status {
// x = lane_id +
// warpSize * (element_id_in_x_tile + warp_id * x_tile_size);
- llvm::Value* x = ir_builder_.CreateNSWAdd(
+ llvm::Value* x = b_.CreateNSWAdd(
lane_id,
- ir_builder_.CreateNSWMul(
+ b_.CreateNSWMul(
index_typed_constant(kWarpSize),
- ir_builder_.CreateNSWAdd(
- x_indvar, ir_builder_.CreateNSWMul(
+ b_.CreateNSWAdd(
+ x_indvar, b_.CreateNSWMul(
warp_id, llvm::ConstantInt::get(
index_ty, x_tile_size)))));
@@ -1479,18 +1474,17 @@ Status IrEmitterUnnested::EmitRowReduction(
// emit an x-in-bounds check before reading from the input.
if (!x_tile_in_bounds) {
llvm_ir::LlvmIfData if_x_in_bounds_data =
- llvm_ir::EmitIfThenElse(ir_builder_.CreateICmpULT(
- x, index_typed_constant(width)),
- "x_in_bounds", &ir_builder_);
- // Points ir_builder_ to the then-block.
+ llvm_ir::EmitIfThenElse(
+ b_.CreateICmpULT(x, index_typed_constant(width)),
+ "x_in_bounds", &b_);
+ // Points b_ to the then-block.
llvm_ir::SetToFirstInsertPoint(if_x_in_bounds_data.true_block,
- &ir_builder_);
+ &b_);
}
// Emit code that reads the input element and accumulates it
// to the partial reduction result.
- llvm::Value* input_address =
- ir_builder_.CreateAlloca(element_ir_type);
+ llvm::Value* input_address = b_.CreateAlloca(element_ir_type);
{
// {z,y,x} is an index to input_3d_tensor_shape
// [depth,height,width]. We need to convert that to an index
@@ -1509,20 +1503,19 @@ Status IrEmitterUnnested::EmitRowReduction(
ShapeUtil::MakeShapeWithDescendingLayout(
input_shape.element_type(), {depth, height, width});
const IrArray::Index input_3d_tensor_index(
- {z, y, x}, input_3d_tensor_shape, &ir_builder_);
+ {z, y, x}, input_3d_tensor_shape, &b_);
const IrArray::Index input_index =
input_3d_tensor_index
.SourceIndexOfReshape(input_3d_tensor_shape,
- normalized_input_shape,
- &ir_builder_)
+ normalized_input_shape, &b_)
.SourceIndexOfTranspose(
normalized_input_shape, input_shape,
- transpose_dimension_mapping, &ir_builder_);
+ transpose_dimension_mapping, &b_);
for (int i = 0; i != num_reduces; ++i) {
TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
input_gens[i](input_index));
- ir_builder_.CreateStore(input_ir_value, input_address);
+ b_.CreateStore(input_ir_value, input_address);
TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
*reducers[i],
{partial_reduction_result_addresses[i], input_address},
@@ -1541,9 +1534,9 @@ Status IrEmitterUnnested::EmitRowReduction(
/*step=*/1, emit_z_tile_element_loop);
};
- llvm::Value* tile_in_bounds = ir_builder_.CreateOr(
- ir_builder_.getInt1(width % (x_tile_size * kWarpSize) == 0),
- ir_builder_.CreateICmpULT(last_x, index_typed_constant(width)));
+ llvm::Value* tile_in_bounds =
+ b_.CreateOr(b_.getInt1(width % (x_tile_size * kWarpSize) == 0),
+ b_.CreateICmpULT(last_x, index_typed_constant(width)));
TF_RETURN_IF_ERROR(
ksl.If(tile_in_bounds,
@@ -1566,26 +1559,25 @@ Status IrEmitterUnnested::EmitRowReduction(
// bitcast cannot be applied to aggregate types (even packed ones), so we
// instead bitcast addresses of load/store to intN* of the same bit-width.
llvm::Type* shuffle_ir_type = element_ir_type->isStructTy()
- ? ir_builder_.getIntNTy(bit_width)
+ ? b_.getIntNTy(bit_width)
: element_ir_type;
for (int shuffle_distance = 16; shuffle_distance >= 1;
shuffle_distance /= 2) {
- llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca(
- element_ir_type, nullptr, "result_from_other_lane");
+ llvm::Value* result_from_other_lane =
+ b_.CreateAlloca(element_ir_type, nullptr, "result_from_other_lane");
for (int i = 0; i != num_reduces; ++i) {
- llvm::Value* partial_reduction_result = ir_builder_.CreateLoad(
- ir_builder_.CreateBitCast(partial_reduction_result_addresses[i],
- shuffle_ir_type->getPointerTo()),
+ llvm::Value* partial_reduction_result = b_.CreateLoad(
+ b_.CreateBitCast(partial_reduction_result_addresses[i],
+ shuffle_ir_type->getPointerTo()),
"partial_reduction_result");
CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0)
<< "Requires block size a multiple of the warp size, otherwise we "
"will read undefined elements.";
- ir_builder_.CreateStore(
+ b_.CreateStore(
EmitFullWarpShuffleDown(partial_reduction_result,
- ir_builder_.getInt32(shuffle_distance),
- &ir_builder_),
- ir_builder_.CreateBitCast(result_from_other_lane,
- shuffle_ir_type->getPointerTo()));
+ b_.getInt32(shuffle_distance), &b_),
+ b_.CreateBitCast(result_from_other_lane,
+ shuffle_ir_type->getPointerTo()));
TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
*reducers[i],
{partial_reduction_result_addresses[i], result_from_other_lane},
@@ -1600,10 +1592,9 @@ Status IrEmitterUnnested::EmitRowReduction(
// lane 0 (which holds the partially accumulated result for its warp) to the
// output element.
llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse(
- ir_builder_.CreateICmpEQ(lane_id, index_typed_constant(0)),
- "lane_id_is_zero", &ir_builder_);
- llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block,
- &ir_builder_);
+ b_.CreateICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero",
+ &b_);
+ llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_);
for (int i = 0; i != num_reduces; ++i) {
llvm::Value* output_address =
GetIrArray(*output, *output, reduce_output_shapes[i])
@@ -1611,8 +1602,8 @@ Status IrEmitterUnnested::EmitRowReduction(
IrArray::Index(y,
ShapeUtil::GetSubshape(
output->shape(), reduce_output_shapes[i]),
- &ir_builder_),
- &ir_builder_, "output_element_address");
+ &b_),
+ &b_, "output_element_address");
// We don't need to emit atomic operations if there is only one tile of
// results. 'depth' is the z dimension, 'width' is the x dimension.
if (z_tile_size >= depth && x_tile_size >= width) {
@@ -1636,7 +1627,7 @@ Status IrEmitterUnnested::EmitRowReduction(
static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
ir_emitter_context_->llvm_module());
return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
- launch_dimensions, &ir_builder_)
+ launch_dimensions, &b_)
.EmitLoop(IrName(reduce), index_ty);
}
@@ -1762,12 +1753,11 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
return EmitReductionToVector(
reduce, input->shape(), {[&](const IrArray::Index& index) {
- return GetIrArray(*input, *reduce)
- .EmitReadArrayElement(index, &ir_builder_);
+ return GetIrArray(*input, *reduce).EmitReadArrayElement(index, &b_);
}},
{[&](const IrArray::Index& index) {
return GetIrArray(*init_value, *reduce)
- .EmitReadArrayElement(index, &ir_builder_);
+ .EmitReadArrayElement(index, &b_);
}},
dimensions_to_reduce, {reducer}, {{}}, {});
}
@@ -1780,9 +1770,12 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
bool all_tuple_elements_have_buffer =
c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) {
- return ir_emitter_context_->buffer_assignment().HasTopLevelAllocation(
- tuple_element);
+ return ir_emitter_context_->buffer_assignment()
+ .GetUniqueTopLevelSlice(tuple_element)
+ .ok();
});
+ // TODO(b/111689850): This logic isn't quite correct.
+ //
// Tuples (especially tuples that are the final result of a computation) can
// be so huge that if we were to emit a kernel that took each tuple element as
// a parameter, we would exceed the max allowable number of parameters to a
@@ -1790,9 +1783,9 @@ Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
// buffer, we collect their buffer addresses in a host array, and then copy
// that array to the tuple's buffer.
//
- // Some tuple elements (e.g. const or bitcast of const) might not have a
- // buffer -- their contents are stored in code. In that case, we fall back to
- // emitting kernels which have access to their buffer addresses in code.
+ // Some tuple elements might not have an unambiguous buffer (like the result
+ // of a select-tuple). In that case, we fall back to emitting kernels which
+ // have access to their buffer addresses in code.
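+  // Conceptually, the resulting thunk performs (a host-side sketch; names
+  // are illustrative):
+  //
+  //   void* element_ptrs[] = {element_buffer_0, ..., element_buffer_n};
+  //   Memcpy(tuple_buffer, element_ptrs, sizeof(element_ptrs));  // H2D copy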
if (all_tuple_elements_have_buffer) {
std::vector<BufferAllocation::Slice> tuple_element_buffers;
for (const HloInstruction* tuple_element : tuple->operands()) {
@@ -1842,7 +1835,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
source->shape(), ir_emitter_context_->device_description());
llvm::Type* index_type = GetIndexTypeForKernel(
- select_and_scatter, launch_dimensions.launch_bound(), &ir_builder_);
+ select_and_scatter, launch_dimensions.launch_bound(), &b_);
auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
return llvm::ConstantInt::get(index_type, c);
};
@@ -1873,19 +1866,18 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
llvm::Value* selected_value_address = llvm_ir::EmitAllocaAtFunctionEntry(
llvm_ir::PrimitiveTypeToIrType(operand_element_type,
ir_emitter_context_->llvm_module()),
- "selected_value_address", &ir_builder_);
+ "selected_value_address", &b_);
llvm::Value* selected_index_address =
llvm_ir::EmitAllocaAtFunctionEntryWithCount(
index_type, index_typed_constant(rank), "selected_index_address",
- &ir_builder_);
+ &b_);
llvm::Value* initialized_flag_address = llvm_ir::EmitAllocaAtFunctionEntry(
- ir_builder_.getInt1Ty(), "initialized_flag_address", &ir_builder_);
- ir_builder_.CreateStore(ir_builder_.getInt1(false),
- initialized_flag_address);
+ b_.getInt1Ty(), "initialized_flag_address", &b_);
+ b_.CreateStore(b_.getInt1(false), initialized_flag_address);
// Create the inner loop to iterate over the window.
- llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "inner"),
- &ir_builder_, index_type);
+ llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "inner"), &b_,
+ index_type);
std::vector<int64> window_size;
for (const auto& dim : window.dimensions()) {
window_size.push_back(dim.size());
@@ -1894,84 +1886,79 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
const IrArray::Index window_index = window_loops.AddLoopsForShape(
ShapeUtil::MakeShape(operand_element_type, window_size), "window");
llvm_ir::SetToFirstInsertPoint(window_loops.GetInnerLoopBodyBasicBlock(),
- &ir_builder_);
+ &b_);
// Compute the operand index to visit, and check whether that index is
// within the bounds. The unsigned comparison includes
// checking whether the operand index >= 0.
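// (For example, an operand index of -1 wraps around to the maximum unsigned
// value and therefore fails the unsigned less-than test against the bound.)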
IrArray::Index operand_index(index_type, source_index.size());
- llvm::Value* in_bounds_condition = ir_builder_.getInt1(true);
+ llvm::Value* in_bounds_condition = b_.getInt1(true);
for (int64 i = 0; i < rank; ++i) {
- llvm::Value* strided_index = ir_builder_.CreateNSWMul(
+ llvm::Value* strided_index = b_.CreateNSWMul(
source_index[i], index_typed_constant(window.dimensions(i).stride()));
- operand_index[i] = ir_builder_.CreateNSWSub(
- ir_builder_.CreateNSWAdd(strided_index, window_index[i]),
+ operand_index[i] = b_.CreateNSWSub(
+ b_.CreateNSWAdd(strided_index, window_index[i]),
index_typed_constant(window.dimensions(i).padding_low()));
- llvm::Value* index_condition = ir_builder_.CreateICmpULT(
+ llvm::Value* index_condition = b_.CreateICmpULT(
operand_index[i],
index_typed_constant(ShapeUtil::GetDimension(operand->shape(), i)));
- in_bounds_condition =
- ir_builder_.CreateAnd(in_bounds_condition, index_condition);
+ in_bounds_condition = b_.CreateAnd(in_bounds_condition, index_condition);
}
CHECK(in_bounds_condition != nullptr);
// Only need to do something if the operand index is within the bounds.
// First check if the initialized_flag is set.
llvm_ir::LlvmIfData if_in_bounds =
- llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_);
- llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, &ir_builder_);
+ llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_);
+ llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, &b_);
llvm_ir::LlvmIfData if_initialized = llvm_ir::EmitIfThenElse(
- ir_builder_.CreateLoad(initialized_flag_address), "initialized",
- &ir_builder_);
+ b_.CreateLoad(initialized_flag_address), "initialized", &b_);
// If the initialized_flag is false, initialize the selected value and index
// with the operand currently being visited.
- llvm_ir::SetToFirstInsertPoint(if_initialized.false_block, &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_initialized.false_block, &b_);
const auto save_operand_index = [&](const IrArray::Index& operand_index) {
for (int64 i = 0; i < rank; ++i) {
llvm::Value* selected_index_address_slot =
- ir_builder_.CreateInBoundsGEP(selected_index_address,
- {ir_builder_.getInt32(i)});
- ir_builder_.CreateStore(operand_index[i], selected_index_address_slot);
+ b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)});
+ b_.CreateStore(operand_index[i], selected_index_address_slot);
}
};
IrArray operand_array = GetIrArray(*operand, *select_and_scatter);
llvm::Value* operand_data =
- operand_array.EmitReadArrayElement(operand_index, &ir_builder_);
- ir_builder_.CreateStore(operand_data, selected_value_address);
+ operand_array.EmitReadArrayElement(operand_index, &b_);
+ b_.CreateStore(operand_data, selected_value_address);
save_operand_index(operand_index);
- ir_builder_.CreateStore(ir_builder_.getInt1(true),
- initialized_flag_address);
+ b_.CreateStore(b_.getInt1(true), initialized_flag_address);
// If the initialized_flag is true, call the `select` function to
// potentially update the selected value and index with the operand
// currently being visited.
- llvm_ir::SetToFirstInsertPoint(if_initialized.true_block, &ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_initialized.true_block, &b_);
const Shape output_shape = ShapeUtil::MakeShape(PRED, {});
llvm::Value* operand_address =
- operand_array.EmitArrayElementAddress(operand_index, &ir_builder_);
+ operand_array.EmitArrayElementAddress(operand_index, &b_);
llvm::Value* select_return_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
llvm_ir::PrimitiveTypeToIrType(PRED,
ir_emitter_context_->llvm_module()),
- "select_return_buffer", &ir_builder_);
+ "select_return_buffer", &b_);
TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
*select_and_scatter->select(),
{selected_value_address, operand_address}, select_return_buffer));
- llvm::Value* result = ir_builder_.CreateLoad(select_return_buffer);
+ llvm::Value* result = b_.CreateLoad(select_return_buffer);
// If the 'select' function returns false, update the selected value and the
// index to the operand currently being visited.
- llvm::Value* cond = ir_builder_.CreateICmpNE(
+ llvm::Value* cond = b_.CreateICmpNE(
result,
llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(
PRED, ir_emitter_context_->llvm_module()),
0),
"boolean_predicate");
llvm_ir::LlvmIfData if_select_lhs =
- llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &ir_builder_);
- llvm_ir::SetToFirstInsertPoint(if_select_lhs.false_block, &ir_builder_);
- ir_builder_.CreateStore(ir_builder_.CreateLoad(operand_address),
- selected_value_address);
+ llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &b_);
+ llvm_ir::SetToFirstInsertPoint(if_select_lhs.false_block, &b_);
+ b_.CreateStore(b_.CreateLoad(operand_address), selected_value_address);
save_operand_index(operand_index);
// After iterating over the window elements, scatter the source element to
@@ -1979,20 +1966,19 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
// location is computed by calling the `scatter` function with the source
// value and the current output value.
llvm_ir::SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(),
- &ir_builder_);
+ &b_);
IrArray::Index selected_index(operand_index.GetType());
for (int64 i = 0; i < rank; ++i) {
- llvm::Value* selected_index_address_slot = ir_builder_.CreateInBoundsGEP(
- selected_index_address, {ir_builder_.getInt32(i)});
- selected_index.push_back(
- ir_builder_.CreateLoad(selected_index_address_slot));
+ llvm::Value* selected_index_address_slot =
+ b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)});
+ selected_index.push_back(b_.CreateLoad(selected_index_address_slot));
}
llvm::Value* source_value_address =
GetIrArray(*source, *select_and_scatter)
- .EmitArrayElementAddress(source_index, &ir_builder_);
+ .EmitArrayElementAddress(source_index, &b_);
llvm::Value* output_value_address =
GetIrArray(*select_and_scatter, *select_and_scatter)
- .EmitArrayElementAddress(selected_index, &ir_builder_);
+ .EmitArrayElementAddress(selected_index, &b_);
return EmitAtomicOperationForNestedComputation(
*select_and_scatter->scatter(), output_value_address,
source_value_address);
@@ -2007,7 +1993,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
ir_emitter_context_->llvm_module());
return ParallelLoopEmitter(loop_body_emitter, source->shape(),
- launch_dimensions, &ir_builder_)
+ launch_dimensions, &b_)
.EmitLoop(IrName(select_and_scatter), index_type);
}
@@ -2034,10 +2020,44 @@ Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) {
return Status::OK();
}
-Status IrEmitterUnnested::HandleRng(HloInstruction* random) {
- thunk_sequence_->push_back(
- BuildKernelThunk(random, /*implements_whole_instruction=*/true));
- return IrEmitter::HandleRng(random);
+Status IrEmitterUnnested::HandleRng(HloInstruction* rng) {
+ // Build the kernel to generate the random numbers.
+ //
+ // Unroll the kernel so that the duplicated computation that calculates the
+ // 128-bit sample can be optimized away by LLVM.
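+ // (Illustration: with an unroll factor of 4, the four per-element Philox
+ // computations in one thread are identical, so LLVM can fold them into a
+ // single 128-bit sample.)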
+ thunk_sequence_->emplace_back(
+ BuildKernelThunk(rng, /*implements_whole_instruction=*/false,
+ ComputeMaxUnrollFactor(rng)));
+ ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
+ for (const HloInstruction* operand : rng->operands()) {
+ operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
+ return GetIrArray(*operand, *rng).EmitReadArrayElement(index, &b_);
+ };
+ }
+ TF_RETURN_IF_ERROR(EmitTargetElementLoop(
+ *rng, GpuElementalIrEmitter(hlo_module_config_, module_, &b_,
+ GetNestedComputer())
+ .MakeElementGenerator(rng, operand_to_generator)));
+ std::unique_ptr<Thunk> rng_thunk = std::move(thunk_sequence_->back());
+ thunk_sequence_->pop_back();
+
+ // Emit a kernel to increment the global state for the Philox RNG algorithm.
+ thunk_sequence_->emplace_back(
+ BuildKernelThunk(rng, /*implements_whole_instruction=*/false));
+ llvm_ir::IncrementVariableForPhiloxRngState(1, module_, &b_);
+ std::unique_ptr<Thunk> increment_seed_thunk =
+ std::move(thunk_sequence_->back());
+ thunk_sequence_->pop_back();
+
+ // Build the SequentialThunk for the RNG hlo.
+ std::vector<std::unique_ptr<Thunk>> thunks;
+ thunks.reserve(2);
+ thunks.push_back(std::move(rng_thunk));
+ thunks.push_back(std::move(increment_seed_thunk));
+ thunk_sequence_->emplace_back(
+ MakeUnique<SequentialThunk>(std::move(thunks), rng));
+
+ return Status::OK();
}
Status IrEmitterUnnested::HandleSelect(HloInstruction* select) {
@@ -2049,30 +2069,94 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) {
Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
std::vector<std::unique_ptr<Thunk>> thunks;
auto values = sort->operand_count() > 1 ? sort->operand(1) : nullptr;
+ ShapeIndex keys_shape_index({});
+ ShapeIndex values_shape_index({});
if (values != nullptr) {
- // TODO(b/26783907): Also sort the values by their corresponding key.
- return Unimplemented("Key/Value Sort is not implemented on GPU");
+ keys_shape_index = ShapeIndex({0});
+ values_shape_index = ShapeIndex({1});
}
+ auto keys_destination = GetAllocationSlice(*sort, keys_shape_index);
- // First copy the operand to the output, so that we can sort in-place.
+ // First copy the operand(s) to the output, so that we can sort in-place.
// TODO(b/26783907): Share buffer of output and operand when it is possible.
if (sort->operand(0)->IsConstant()) {
thunks.push_back(MakeUnique<HostToDeviceCopyThunk>(
/*source_address=*/sort->operand(0)->literal().untyped_data(),
- /*destination_buffer=*/GetAllocationSlice(*sort),
- /*mem_size=*/ShapeUtil::ByteSizeOf(sort->shape()), sort));
+ /*destination_buffer=*/keys_destination,
+ /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(0)->shape()),
+ nullptr));
} else {
thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
/*source_address=*/GetAllocationSlice(*sort->operand(0)),
- /*destination_buffer=*/GetAllocationSlice(*sort),
- /*mem_size=*/ShapeUtil::ByteSizeOf(sort->shape()), sort));
+ /*destination_buffer=*/keys_destination,
+ /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(0)->shape()),
+ nullptr));
+ }
+ if (values != nullptr) {
+ if (values->IsConstant()) {
+ thunks.push_back(MakeUnique<HostToDeviceCopyThunk>(
+ /*source_address=*/sort->operand(1)->literal().untyped_data(),
+ /*destination_buffer=*/GetAllocationSlice(*sort, values_shape_index),
+ /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(1)->shape()),
+ nullptr));
+ } else {
+ thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
+ /*source_address=*/GetAllocationSlice(*sort->operand(1)),
+ /*destination_buffer=*/GetAllocationSlice(*sort, values_shape_index),
+ /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(1)->shape()),
+ nullptr));
+ }
+ }
+
+ int64 dimension_to_sort = sort->dimensions(0);
+ int64 dimension_to_sort_bound =
+ sort->operand(0)->shape().dimensions(dimension_to_sort);
+ int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound);
+ auto index_type = b_.getInt64Ty();
+
+ // Naive C++ code for the outer loops:
+ //
+ // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
+ // ++stage) {
+ // int64 first_xor_mask = (1LL << (stage + 1)) - 1;
+ // SortInPlace(first_xor_mask);
+ // for (int64 mask = stage - 1; mask >= 0; --mask) {
+ // int64 later_xor_mask = 1LL << mask;
+ // SortInPlace(later_xor_mask);
+ // }
+ // }
+ //
+ // This follows the algorithm described on Wikipedia:
+ // https://en.wikipedia.org/wiki/Bitonic_sorter
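+ //
+ // For example, with dimension_to_sort_bound == 8 (so num_stages == 3), the
+ // successive kernel launches below use xor_mask values 1, 3, 1, 7, 2, 1;
+ // each launch compares and, if needed, swaps elements i and i ^ xor_mask.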
+
+ for (int64 stage = 0; stage < num_stages; ++stage) {
+ for (int64 mask = stage; mask >= 0; --mask) {
+ thunks.push_back(
+ BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
+ LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
+ sort->operand(0)->shape(), ir_emitter_context_->device_description());
+ UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
+ ir_emitter_context_->llvm_module());
+
+ llvm::Value* xor_mask;
+ if (mask == stage) {
+ xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1);
+ } else {
+ xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask);
+ }
+
+ TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace(
+ dimension_to_sort, GetIrArray(*sort, *sort, keys_shape_index),
+ values != nullptr ? tensorflow::gtl::make_optional<IrArray>(
+ GetIrArray(*sort, *sort, values_shape_index))
+ : tensorflow::gtl::nullopt,
+ IrName(sort), xor_mask, &b_, &launch_dimensions));
+ }
}
- thunks.push_back(
- BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
thunk_sequence_->emplace_back(
MakeUnique<SequentialThunk>(std::move(thunks), sort));
- return IrEmitter::HandleSort(sort);
+ return Status::OK();
}
Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) {
@@ -2228,11 +2312,6 @@ GetHloBufferSlices(const HloInstruction* hlo,
// Adds entries for all subshapes of instr to `slices`.
auto add_slices_for = [&](const HloInstruction* instr) {
- // GPU constants don't have buffers; don't bother looking for one.
- if (instr->IsConstant()) {
- return;
- }
-
ShapeUtil::ForEachSubshape(
instr->shape(), [&](const Shape& /*shape*/, const ShapeIndex& index) {
if (slices.count({instr, index})) {
@@ -2294,21 +2373,25 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
// We'll pass a pointer to each of the elements of `buffers` to our kernel, in
// this order.
- std::vector<const BufferAllocation*> buffers(buffers_needed.begin(),
- buffers_needed.end());
- std::sort(buffers.begin(), buffers.end(),
+ std::vector<const BufferAllocation*> non_constant_buffers;
+ c_copy_if(buffers_needed, std::back_inserter(non_constant_buffers),
+ [](const BufferAllocation* allocation) {
+ return !allocation->is_constant();
+ });
+
+ std::sort(non_constant_buffers.begin(), non_constant_buffers.end(),
[](const BufferAllocation* a, const BufferAllocation* b) {
return a->index() < b->index();
});
- llvm::Function* kernel = BuildKernelPrototype(*inst, buffers);
+ llvm::Function* kernel = BuildKernelPrototype(*inst, non_constant_buffers);
// Build a map from a BufferAllocation to the corresponding argument in our
// kernel.
std::unordered_map<const BufferAllocation*, llvm::Value*> kernel_args;
{
auto arg_it = kernel->arg_begin();
- auto buffers_it = buffers.begin();
+ auto buffers_it = non_constant_buffers.begin();
for (; arg_it != kernel->arg_end(); ++arg_it, ++buffers_it) {
kernel_args[*buffers_it] = arg_it;
}
@@ -2326,18 +2409,24 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
<< " is found in slice " << slice.ToString() << " at GTE index "
<< gte_index.ToString();
- llvm::Value* loc =
- ir_builder_.CreateInBoundsGEP(kernel_args.at(slice.allocation()),
- {ir_builder_.getInt64(slice.offset())});
+ llvm::Value* loc;
+ if (slice.allocation()->is_constant()) {
+ loc = ir_emitter_context_->llvm_module()->getGlobalVariable(
+ llvm_ir::AsStringRef(llvm_ir::ConstantBufferAllocationToGlobalName(
+ *slice.allocation())));
+ CHECK_NE(loc, nullptr);
+ } else {
+ loc = b_.CreateInBoundsGEP(kernel_args.at(slice.allocation()),
+ {b_.getInt64(slice.offset())});
+ }
// If gte_index is nonempty, we have to dereference `loc` to get to the
// value we're ultimately interested in.
llvm::Type* int8_double_pointer =
- llvm::PointerType::get(ir_builder_.getInt8PtrTy(), /*AddressSpace=*/0);
+ llvm::PointerType::get(b_.getInt8PtrTy(), /*AddressSpace=*/0);
for (int64 idx : gte_index) {
- loc = ir_builder_.CreateBitCast(loc, int8_double_pointer);
- loc = ir_builder_.CreateLoad(
- ir_builder_.CreateInBoundsGEP(loc, {ir_builder_.getInt64(idx)}));
+ loc = b_.CreateBitCast(loc, int8_double_pointer);
+ loc = b_.CreateLoad(b_.CreateInBoundsGEP(loc, {b_.getInt64(idx)}));
}
bindings_.BindHloToIrValue(*instr, loc, index);
@@ -2349,12 +2438,12 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
bindings_.SetTempBufferBase(kernel_args.at(*temp_buffer));
} else {
bindings_.SetTempBufferBase(
- llvm::ConstantPointerNull::get(ir_builder_.getInt8PtrTy()));
+ llvm::ConstantPointerNull::get(b_.getInt8PtrTy()));
}
- return MakeUnique<KernelThunk>(buffers, llvm_ir::AsString(kernel->getName()),
- implements_whole_instruction ? inst : nullptr,
- unroll_factor);
+ return MakeUnique<KernelThunk>(
+ non_constant_buffers, llvm_ir::AsString(kernel->getName()),
+ implements_whole_instruction ? inst : nullptr, unroll_factor);
}
std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
@@ -2591,15 +2680,24 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
// If the init_value was fused into this reduce, we have to generate it first.
if (fused && init_value_operand->opcode() != HloOpcode::kParameter) {
CHECK_EQ(HloOpcode::kConstant, init_value_operand->opcode());
- TF_RETURN_IF_ERROR(HandleConstant(const_cast<HloInstruction*>(init_value)));
+
+ const Literal& literal = init_value_operand->literal();
+ llvm::Constant* initializer =
+ llvm_ir::ConvertLiteralToIrConstant(literal, module_);
+
+ llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
+ *module_, initializer->getType(),
+ /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, initializer,
+ /*Name=*/"");
+ global_for_const->setAlignment(kConstantBufferAlignBytes);
+ bindings_.BindHloToIrValue(*init_value_operand, global_for_const);
}
TF_RETURN_IF_ERROR(ParallelLoopEmitter(
[=](const IrArray::Index& index) {
return GetIrArray(*init_value, *hlo)
- .EmitReadArrayElement(index, &ir_builder_);
+ .EmitReadArrayElement(index, &b_);
},
- GetIrArray(*hlo, *hlo, index), launch_dimensions,
- &ir_builder_)
+ GetIrArray(*hlo, *hlo, index), launch_dimensions, &b_)
.EmitLoop(IrName(hlo)));
// Clean up state left behind by emitting the loop above. (This is normally
@@ -2710,13 +2808,13 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildWhileThunk(
HloComputation* condition = hlo->while_condition();
IrEmitterUnnested ir_emitter_condition(hlo_module_config_, condition,
ir_emitter_context_);
- TF_CHECK_OK(condition->root_instruction()->Accept(&ir_emitter_condition));
+ TF_CHECK_OK(condition->Accept(&ir_emitter_condition));
// Generate thunk sequence for while 'body'.
HloComputation* body = hlo->while_body();
IrEmitterUnnested ir_emitter_body(hlo_module_config_, body,
ir_emitter_context_);
- TF_CHECK_OK(body->root_instruction()->Accept(&ir_emitter_body));
+ TF_CHECK_OK(body->Accept(&ir_emitter_body));
return MakeUnique<WhileThunk>(
GetAllocationSlice(*condition->root_instruction()), // cond result
@@ -2734,7 +2832,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildForThunk(
HloComputation* body = hlo->while_body();
IrEmitterUnnested ir_emitter_body(hlo_module_config_, body,
ir_emitter_context_);
- TF_CHECK_OK(body->root_instruction()->Accept(&ir_emitter_body));
+ TF_CHECK_OK(body->Accept(&ir_emitter_body));
return MakeUnique<ForThunk>(loop_limit,
ir_emitter_body.ConsumeThunkSequence(), hlo);
@@ -2750,12 +2848,12 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
HloComputation* true_computation = hlo->true_computation();
IrEmitterUnnested ir_emitter_true(hlo_module_config_, true_computation,
ir_emitter_context_);
- TF_CHECK_OK(true_computation->root_instruction()->Accept(&ir_emitter_true));
+ TF_CHECK_OK(true_computation->Accept(&ir_emitter_true));
HloComputation* false_computation = hlo->false_computation();
IrEmitterUnnested ir_emitter_false(hlo_module_config_, false_computation,
ir_emitter_context_);
- TF_CHECK_OK(false_computation->root_instruction()->Accept(&ir_emitter_false));
+ TF_CHECK_OK(false_computation->Accept(&ir_emitter_false));
return MakeUnique<ConditionalThunk>(
GetAllocationSlice(*hlo->operand(0)),
@@ -2783,10 +2881,10 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
ir_emitter_context_->llvm_module());
if (!hlo.IsMultiOutputFusion()) {
return ParallelLoopEmitter(element_generator, GetIrArray(hlo, hlo),
- launch_dimensions, &ir_builder_, unroll_factor)
- .EmitLoop(IrName(&hlo),
- GetIndexTypeForKernel(&hlo, launch_dimensions.launch_bound(),
- &ir_builder_));
+ launch_dimensions, &b_, unroll_factor)
+ .EmitLoop(
+ IrName(&hlo),
+ GetIndexTypeForKernel(&hlo, launch_dimensions.launch_bound(), &b_));
}
// For multioutput fusion, we need to emit each operand and the root.
@@ -2796,25 +2894,24 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
}
TF_RETURN_IF_ERROR(
ParallelLoopEmitter(element_generator, output_arrays, launch_dimensions,
- &ir_builder_, unroll_factor)
+ &b_, unroll_factor)
.EmitLoop(IrName(&hlo),
GetIndexTypeForKernel(
- &hlo, launch_dimensions.launch_bound(), &ir_builder_)));
+ &hlo, launch_dimensions.launch_bound(), &b_)));
std::vector<llvm::Value*> tuple_operand_ptrs;
for (int64 i = 0; i < output_arrays.size(); ++i) {
tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
}
- ir_builder_.SetInsertPoint(ir_builder_.GetInsertBlock()->getTerminator());
- llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &ir_builder_,
- module_);
+ b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator());
+ llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &b_, module_);
return Status::OK();
}
Status IrEmitterUnnested::EmitTargetElementLoop(
const HloInstruction& hlo,
const llvm_ir::ElementGenerator& element_generator) {
- CHECK(Thunk::Kind::kKernel == LastThunk()->kind());
+ CHECK_EQ(Thunk::Kind::kKernel, LastThunk()->kind());
return EmitTargetElementLoopInThunk(hlo, element_generator,
static_cast<KernelThunk*>(LastThunk()));
}
@@ -2858,14 +2955,14 @@ int IrEmitterUnnested::ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
ShapeUtil::GetSubshape(hlo.shape(), {i}).element_type(),
reduced_output_dims));
- output_in_reduced_shape_arrays->push_back(output_arrays[i].CastToShape(
- (*output_reduced_shapes)[i], &ir_builder_));
+ output_in_reduced_shape_arrays->push_back(
+ output_arrays[i].CastToShape((*output_reduced_shapes)[i], &b_));
}
} else {
output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
hlo.shape().element_type(), reduced_output_dims));
- output_in_reduced_shape_arrays->push_back(output_arrays[0].CastToShape(
- (*output_reduced_shapes)[0], &ir_builder_));
+ output_in_reduced_shape_arrays->push_back(
+ output_arrays[0].CastToShape((*output_reduced_shapes)[0], &b_));
}
return num_outputs;
}
@@ -2889,8 +2986,8 @@ int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
param_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
param->shape().element_type(),
Permute({0, 2, 1}, reduced_output_dims)));
- param_in_reduced_shape_arrays->push_back(param_arrays[id].CastToShape(
- (*param_reduced_shapes)[id], &ir_builder_));
+ param_in_reduced_shape_arrays->push_back(
+ param_arrays[id].CastToShape((*param_reduced_shapes)[id], &b_));
}
return num_params;
}
@@ -3039,7 +3136,7 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
kTileSize);
const int kNVPTXSharedMemoryAddrSpace = 3;
auto* tile_base_ptr = new llvm::GlobalVariable(
- *ir_builder_.GetInsertBlock()->getParent()->getParent(), tile_type,
+ *b_.GetInsertBlock()->getParent()->getParent(), tile_type,
/*isConstant=*/false, llvm::GlobalValue::PrivateLinkage,
llvm::UndefValue::get(tile_type),
llvm_ir::AsStringRef(IrName(hlo, StrCat("tile", id))), nullptr,
@@ -3063,8 +3160,8 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
c_accumulate(output_dims_in_tiles, 1, std::multiplies<int64>());
LaunchDimensions launch_dimensions(num_tiles, kThreadsPerTile);
- llvm::Type* index_ty = GetIndexTypeForKernel(
- hlo, launch_dimensions.launch_bound(), &ir_builder_);
+ llvm::Type* index_ty =
+ GetIndexTypeForKernel(hlo, launch_dimensions.launch_bound(), &b_);
auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
return llvm::ConstantInt::get(index_ty, c);
};
@@ -3092,23 +3189,23 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
llvm::Value* x;
llvm::Value* y;
std::tie(y, x) = CalculateYXCoordinateWithinTile(
- &ir_builder_, index_typed_constant(kTileSize), kThreadsPerTile);
+ &b_, index_typed_constant(kTileSize), kThreadsPerTile);
// Calculate the index for the current output tile from block_id.
const IrArray::Index output_tile_index(
- GetBlockIdx(&ir_builder_, index_ty, num_tiles),
+ GetBlockIdx(&b_, index_ty, num_tiles),
ShapeUtil::MakeShapeWithDescendingLayout(PRED /*arbitrary*/,
output_dims_in_tiles),
- &ir_builder_);
+ &b_);
// Output tile origin is the index for the first element of the current output
// tile.
const IrArray::Index output_tile_origin = [&] {
IrArray::Index index = output_tile_index;
for (int i = 1; i < 3; ++i) {
- index[i] = ir_builder_.CreateMul(output_tile_index[i],
- index_typed_constant(kTileSize),
- "tile_origin." + std::to_string(i));
+ index[i] =
+ b_.CreateMul(output_tile_index[i], index_typed_constant(kTileSize),
+ "tile_origin." + std::to_string(i));
}
return index;
}();
@@ -3121,16 +3218,15 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
std::vector<llvm::Value*> output_tile_bounds(3);
for (int i = 1; i < 3; ++i) {
// Only the last row or column may not have full size.
- output_tile_bounds[i] = ir_builder_.CreateSelect(
- ir_builder_.CreateICmpEQ(
- output_tile_index[i],
- index_typed_constant(output_dims_in_tiles[i] - 1)),
+ output_tile_bounds[i] = b_.CreateSelect(
+ b_.CreateICmpEQ(output_tile_index[i],
+ index_typed_constant(output_dims_in_tiles[i] - 1)),
index_typed_constant(reduced_output_dims[i] -
(output_dims_in_tiles[i] - 1) * kTileSize),
index_typed_constant(kTileSize), "kTileSize");
}
- KernelSupportLibrary ksl(&ir_builder_, llvm_ir::UnrollMode::kDefaultUnroll);
+ KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll);
// Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck.
auto emit_tiled_elemental_code_with_bounds_check =
@@ -3139,13 +3235,13 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
const std::function<void(const IrArray::Index&, llvm::Value*)>&
emit_elem_function) {
EmitTiledElementalCodeWithBoundsCheck(
- kTileSize, kNumRows, index, loop_name, &ksl, &ir_builder_, y, x,
- tile_width, tile_height, emit_elem_function);
+ kTileSize, kNumRows, index, loop_name, &ksl, &b_, y, x, tile_width,
+ tile_height, emit_elem_function);
};
// Adds `addend` to the given `dim` of `index`.
auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) {
- index[dim] = ir_builder_.CreateAdd(index[dim], addend);
+ index[dim] = b_.CreateAdd(index[dim], addend);
return index;
};
const IrArray::Index input_index =
@@ -3161,19 +3257,17 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
llvm::Value* shmem_buffer = param_shmem_buffers[id];
// TODO(jlebar): Add AA metadata to this store. Tile buffers are
// global variables, so LLVM can't infer much about it.
- ir_builder_.CreateStore(
- input_in_logical_shape.EmitReadArrayElement(index, &ir_builder_,
+ b_.CreateStore(
+ input_in_logical_shape.EmitReadArrayElement(index, &b_,
"input_element"),
- ir_builder_.CreateGEP(shmem_buffer,
- {index_typed_constant(0), y_loc, x}));
+ b_.CreateGEP(shmem_buffer, {index_typed_constant(0), y_loc, x}));
}
});
// Wait for all threads to reach this point, lest we copy a value from tile to
// output before the other thread copies it from input to tile.
// This is `__syncthreads` in CUDA.
- llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {},
- &ir_builder_);
+ llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
@@ -3187,27 +3281,26 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
[&](const IrArray::Index& index, llvm::Value* y_loc) {
// TODO(jlebar): Add AA metadata to this load.
- llvm::Instruction* load_from_shmem_buffer = ir_builder_.CreateLoad(
- ir_builder_.CreateGEP(param_shmem_buffers[0],
- {ir_builder_.getInt64(0), x, y_loc}),
+ llvm::Instruction* load_from_shmem_buffer = b_.CreateLoad(
+ b_.CreateGEP(param_shmem_buffers[0], {b_.getInt64(0), x, y_loc}),
"output_element");
output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
- index, load_from_shmem_buffer, &ir_builder_);
+ index, load_from_shmem_buffer, &b_);
});
} else {
CHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
emit_tiled_elemental_code_with_bounds_check(
output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
[&](const IrArray::Index& index, llvm::Value* y_loc) {
- GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_,
- &ir_builder_, GetNestedComputer());
+ GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_,
+ GetNestedComputer());
FusedIrEmitter fused_emitter(param_arrays, &elem_emitter);
tiled_param_info.set_y(y_loc);
fused_emitter.SetTiledParameterInfo(&tiled_param_info);
TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter));
IrArray::Index untiled_index = llvm_ir::GetUnreducedOutputIndex(
index, output_reduced_shapes[0], output_arrays[0].GetShape(),
- &ir_builder_);
+ &b_);
const llvm_ir::ElementGenerator& output_generator =
fused_emitter.GetRootGenerator();
llvm::Value* output_value =
@@ -3218,12 +3311,11 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
output_in_reduced_shape_arrays.size());
for (int64 i = 0; i < output_in_reduced_shape_arrays.size(); ++i) {
output_in_reduced_shape_arrays[i].EmitWriteArrayElement(
- index, ir_builder_.CreateExtractValue(output_value, i),
- &ir_builder_);
+ index, b_.CreateExtractValue(output_value, i), &b_);
}
} else {
output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
- index, output_value, &ir_builder_);
+ index, output_value, &b_);
}
});
}
@@ -3234,7 +3326,7 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
for (int64 i = 0; i < output_arrays.size(); ++i) {
tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
}
- llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), tuple_operand_ptrs, &ir_builder_,
+ llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), tuple_operand_ptrs, &b_,
module_);
}
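
Aside from the mechanical rename of `ir_builder_` to `b_`, the hunks above implement the classic shared-memory tiled transpose: compute the origin and bounds of the current tile, stage a tile of the input in shared memory, synchronize, then write it back transposed. As a reading aid, here is a minimal standalone C++ sketch of just the index arithmetic, with plain integers standing in for the emitted LLVM values (`kTileSize`, the tile index, and the reduced dimensions are the only inputs assumed):

    #include <array>
    #include <cstdint>

    // Sketch only: mirrors the emitted index math, not the emitter itself.
    // Dim 0 is untiled; dims 1 and 2 are covered by kTileSize x kTileSize tiles.
    constexpr int64_t kTileSize = 32;

    struct TileCoords {
      std::array<int64_t, 3> origin;  // first element covered by this tile
      std::array<int64_t, 3> bounds;  // extent of this tile per dimension
    };

    TileCoords ComputeTileCoords(const std::array<int64_t, 3>& tile_index,
                                 const std::array<int64_t, 3>& dims_in_tiles,
                                 const std::array<int64_t, 3>& reduced_dims) {
      TileCoords tc;
      tc.origin[0] = tile_index[0];
      tc.bounds[0] = 1;
      for (int i = 1; i < 3; ++i) {
        tc.origin[i] = tile_index[i] * kTileSize;
        // Only the last row or column of tiles may be partial.
        tc.bounds[i] = (tile_index[i] == dims_in_tiles[i] - 1)
                           ? reduced_dims[i] - (dims_in_tiles[i] - 1) * kTileSize
                           : kTileSize;
      }
      return tc;
    }
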
@@ -3285,6 +3377,40 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
return false;
}
+ // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the
+ // elements are of size 4 bytes), and CUDA has an architectural limit of 48kb
+ // shared memory per SM. (This is increased to 96kb in Volta, but we don't
+ // use this, in part because it eats into our L1 cache space.)
+ //
+ // For correctness we need to ensure that we don't make more than 48kb worth
+ // of shmem tiles per block. And for performance, we'd probably like to use
+ // significantly less, so that we can fit more than one block at a time on a
+ // gpu core.
+ //
+ // We say without benchmarks that we want at least 3 blocks/core, which
+ // corresponds to at most 3 shmem tiles per block if the elements are 32 bits
+ // wide. We choose which params get the shmem transpose treatment
+ // arbitrarily; it's not clear if there's a Right Choice.
+ //
+ // This is only sound if tiled transposes are the only place where we use
+ // shared memory in fusions. If in the future other fusible ops use shared
+ // memory, we'll have to adjust this heuristic.
+ constexpr int kMinBlocksPerCore = 3;
+ constexpr int64 kShmemPerCore = 48 * 1024;
+ int64 shmem_used = 0;
+ for (int64 i = 0; i < params_012.size(); ++i) {
+ const HloInstruction* operand = hlo->operand(params_012[i]);
+ shmem_used +=
+ 32 * 33 *
+ ShapeUtil::ByteSizeOfPrimitiveType(operand->shape().element_type());
+
+ if (kMinBlocksPerCore * shmem_used > kShmemPerCore) {
+ // Erase this element and everything after it from params_012.
+ params_012.resize(i);
+ break;
+ }
+ }
+
VLOG(3) << "EmitHlo021Tile Emitting hlo tile 0-2-1: " << hlo->ToString();
thunk_sequence_->emplace_back(
BuildKernelThunk(hlo, /*implements_whole_instruction=*/true));
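
The new shared-memory budget check above is a straightforward running sum. A hedged standalone restatement (with `elem_bytes[i]` standing in for `ShapeUtil::ByteSizeOfPrimitiveType` on the operand's element type):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Keep accepting params for the shmem-transpose treatment until three
    // blocks' worth of 32x33 tiles would exceed the 48KB-per-core budget.
    std::vector<int64_t> LimitShmemParams(const std::vector<int64_t>& params,
                                          const std::vector<int64_t>& elem_bytes) {
      constexpr int64_t kMinBlocksPerCore = 3;
      constexpr int64_t kShmemPerCore = 48 * 1024;
      std::vector<int64_t> kept;
      int64_t shmem_used = 0;
      for (size_t i = 0; i < params.size(); ++i) {
        shmem_used += 32 * 33 * elem_bytes[i];  // one tile per kept param
        if (kMinBlocksPerCore * shmem_used > kShmemPerCore) break;  // drop rest
        kept.push_back(params[i]);
      }
      return kept;
    }
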
@@ -3296,5 +3422,47 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
return true;
}
+Status IrEmitterUnnested::EmitConstantGlobals() {
+ for (const BufferAllocation& allocation :
+ ir_emitter_context_->buffer_assignment().Allocations()) {
+ if (!allocation.is_constant()) {
+ continue;
+ }
+
+ const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation);
+ const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal);
+ llvm::ArrayType* global_type =
+ llvm::ArrayType::get(b_.getInt8Ty(), allocation.size());
+ llvm::Constant* initializer =
+ should_emit_initializer
+ ? llvm_ir::ConvertLiteralToIrConstant(literal, module_)
+ : llvm::ConstantAggregateZero::get(global_type);
+ if (should_emit_initializer) {
+ VLOG(3) << "Emitted initializer for constant with shape "
+ << ShapeUtil::HumanString(literal.shape());
+ }
+
+ // These globals will be looked up by name by GpuExecutable so we need to
+ // give them an external linkage. Not all of their uses are visible in the
+ // LLVM IR (e.g. TupleThunk) so we can't give them a linkage that merely
+ // preserves their names (like available_externally); we also need to ensure
+ // that they stick around even if they're "unused".
+ //
+ // We may have to be more clever here in the future if we notice that
+ // we're keeping around too many globals because of their linkage.
+ llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
+ global_type, /*isConstant=*/should_emit_initializer,
+ llvm::GlobalValue::ExternalLinkage,
+ /*Initializer=*/initializer,
+ llvm_ir::AsStringRef(
+ llvm_ir::ConstantBufferAllocationToGlobalName(allocation)));
+ global_for_const->setAlignment(kConstantBufferAlignBytes);
+ ir_emitter_context_->llvm_module()->getGlobalList().push_back(
+ global_for_const);
+ }
+
+ return Status::OK();
+}
+
} // namespace gpu
} // namespace xla
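
`EmitConstantGlobals` boils down to ordinary `llvm::GlobalVariable` construction. A self-contained sketch of that core step, using the stock LLVM C++ API of this era (the function name, `size`, and the hard-coded alignment are illustrative, not the emitter's own):

    #include <cstdint>

    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"

    // Emit one externally visible i8-array global, roughly as done above for
    // each constant allocation. External linkage keeps the global alive and
    // findable by name even when no use of it is visible in the IR.
    llvm::GlobalVariable* EmitByteArrayGlobal(llvm::Module* module,
                                              uint64_t size,
                                              llvm::StringRef name) {
      llvm::ArrayType* type = llvm::ArrayType::get(
          llvm::Type::getInt8Ty(module->getContext()), size);
      auto* global = new llvm::GlobalVariable(
          *module, type, /*isConstant=*/true,
          llvm::GlobalValue::ExternalLinkage,
          /*Initializer=*/llvm::ConstantAggregateZero::get(type), name);
      global->setAlignment(64);  // stand-in for kConstantBufferAlignBytes;
                                 // newer LLVM takes llvm::Align(64) instead.
      return global;
    }
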
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 616d8a2206..5254419907 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -92,6 +92,9 @@ class IrEmitterUnnested : public IrEmitter {
const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter,
KernelThunk* thunk);
+ // Emits LLVM global variables corresponding to constant instructions.
+ Status EmitConstantGlobals();
+
private:
// Builds the appropriate thunk for the instruction hlo and returns the owning
// pointer to it. The caller needs to make sure `inst` outlives the lifetime
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index f95fbb01f9..c67dcbce77 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -24,6 +24,7 @@ limitations under the License.
#include <utility>
#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -112,7 +113,7 @@ bool IsInputFusibleReduction(HloInstruction* instr) {
// of input parameters differ. In such situations it is beneficial not to fuse.
// We consider input params with maximum rank only. Params with smaller ranks
// will be broadcasted and have not been observed to cause data locality issues.
-// TODO(b/110927656): Improve reduce emitters to remove this limitation.
+// TODO(b/111977086): Improve reduce emitters to remove this limitation.
bool ReduceFriendlyInputLayouts(HloInstruction* instr) {
int64 max_rank = 0;
const Layout* max_rank_layout;
@@ -163,16 +164,22 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1,
if (!MultiOutputFusion::LegalToFuse(instr1, instr2)) {
return false;
}
+
// If we're fusing fusions only do it if the fusion kind matches. Loop fusions
// merge into bigger loop fusions and input (reduce) fusions become fusions
// with multiple reduce outputs. We could fuse reduce and loop fusions
// together too (the result being an input fusion) if we find cases where this
// improves things.
CHECK(instr1->opcode() == HloOpcode::kFusion);
- if (instr2->opcode() == HloOpcode::kFusion) {
- return instr1->fusion_kind() == instr2->fusion_kind();
+ if ((instr2->opcode() == HloOpcode::kFusion &&
+ instr1->fusion_kind() != instr2->fusion_kind()) ||
+ (instr2->opcode() != HloOpcode::kFusion &&
+ instr1->fusion_kind() == HloInstruction::FusionKind::kLoop)) {
+ return false;
}
- return instr1->fusion_kind() != HloInstruction::FusionKind::kLoop;
+
+ // Do this check last, as it may be expensive.
+ return !GpuInstructionFusion::FusionWouldBeTooLarge(instr1, instr2);
}
bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
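
The rewritten `LegalToFuse` condition folds the old two early returns into one negated test and appends the new size guard. Spelled out as a standalone predicate over booleans (a sketch for readability only):

    // instr1 is known to be a fusion. Fusing is legal iff the fusion kinds are
    // compatible and the merged node stays under the operand/output budget;
    // the (possibly expensive) size check runs last.
    bool LegalToFuseSketch(bool instr2_is_fusion, bool kinds_match,
                           bool instr1_is_loop_fusion, bool would_be_too_large) {
      if (instr2_is_fusion && !kinds_match) return false;
      if (!instr2_is_fusion && instr1_is_loop_fusion) return false;
      return !would_be_too_large;
    }
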
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index 451e49f23a..ec4234b8d9 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
+#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
#include "tensorflow/compiler/xla/service/hlo_matchers.h"
#include "tensorflow/compiler/xla/service/hlo_parser.h"
#include "tensorflow/compiler/xla/status_macros.h"
@@ -419,5 +420,58 @@ TEST_F(MultiOutputFusionTest,
ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
}
+// Check that we limit the number of operands to fusions we create.
+TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
+ constexpr int64 kNumParams = 200;
+ ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion);
+
+ // Compute
+ // p0 * p1,
+ // p0 * p1 + p1 * p2
+ // p0 * p1 + p1 * p2 + p2 * p3
+ // ...
+ // where each of the (pi * pj)'s is represented as a fusion node so that
+ // multi-output fusion will pay attention to it.
+ auto module = CreateNewModule();
+ HloComputation::Builder b(TestName());
+ Shape shape = ShapeUtil::MakeShape(F32, {10, 100});
+
+ std::vector<HloInstruction*> params;
+ for (int64 i = 0; i < kNumParams; ++i) {
+ params.push_back(
+ b.AddInstruction(HloInstruction::CreateParameter(i, shape, "p")));
+ }
+
+ // Creates a fusion node that calculates x*y.
+ auto make_fusion = [&](HloInstruction* x, HloInstruction* y) {
+ HloComputation::Builder sub_builder("subcomp");
+ auto* p0 = sub_builder.AddInstruction(
+ HloInstruction::CreateParameter(0, shape, "p"));
+ auto* p1 = sub_builder.AddInstruction(
+ HloInstruction::CreateParameter(1, shape, "p"));
+ sub_builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, p0, p1));
+ HloComputation* subcomp =
+ module->AddEmbeddedComputation(sub_builder.Build());
+ return HloInstruction::CreateFusion(
+ shape, HloInstruction::FusionKind::kLoop, {x, y}, subcomp);
+ };
+
+ auto* sum = b.AddInstruction(make_fusion(params[0], params[1]));
+ for (int64 i = 2; i < kNumParams; ++i) {
+ sum = b.AddInstruction(HloInstruction::CreateBinary(
+ shape, HloOpcode::kAdd, sum,
+ b.AddInstruction(make_fusion(params[i - 1], params[i]))));
+ }
+ auto computation = module->AddEntryComputation(b.Build());
+ EXPECT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+ SCOPED_TRACE(module->ToString());
+ for (const HloInstruction* instr : computation->instructions()) {
+ EXPECT_LE(instr->operand_count() + ShapeUtil::SubshapeCount(instr->shape()),
+ GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion)
+ << instr->ToString();
+ }
+}
+
} // namespace gpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index ad29862d83..6d8996dac1 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -540,11 +540,13 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
// temporary buffers are required to run the computation.
TF_ASSIGN_OR_RETURN(
std::unique_ptr<BufferAssignment> buffer_assignment,
- BufferAssigner::Run(module.get(), hlo_schedule->ConsumeHloOrdering(),
- BufferSizeBytesFunction(),
- /*color_alignment=*/[](LogicalBuffer::Color) {
- return kCudaMallocAlignBytes;
- }));
+ BufferAssigner::Run(
+ module.get(), hlo_schedule->ConsumeHloOrdering(),
+ BufferSizeBytesFunction(),
+ /*color_alignment=*/
+ [](LogicalBuffer::Color) { return kXlaAllocatedBufferAlignBytes; },
+ /*allow_input_output_aliasing=*/false,
+ /*allocate_buffers_for_constants=*/true));
// BufferAssignment::Stats::ToString() and BufferAssignment::ToString()
// include headers, so no need for us to print them ourselves.
XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
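
Two coupled changes here: passing `/*allocate_buffers_for_constants=*/true` asks BufferAssignment to give every constant its own allocation, and the color-alignment lambda now returns `kXlaAllocatedBufferAlignBytes` instead of `kCudaMallocAlignBytes`. Together with the `EmitConstantGlobals` call added below, this moves constants into module-level globals that `GpuExecutable` resolves by name.
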
@@ -565,6 +567,9 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
HloComputation* entry_computation = module->entry_computation();
IrEmitterUnnested ir_emitter(module->config(), entry_computation,
&ir_emitter_context);
+
+ TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
+
{
XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - IR emission");
TF_RETURN_IF_ERROR(entry_computation->Accept(&ir_emitter));
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index cd833ec7bd..3838fee674 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -32,27 +32,27 @@ namespace gpu {
ParallelLoopEmitter::ParallelLoopEmitter(
BodyEmitter body_emitter, const Shape& shape,
- const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+ const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b,
int unroll_factor)
- : LoopEmitter(body_emitter, shape, ir_builder),
+ : LoopEmitter(body_emitter, shape, b),
launch_dimensions_(launch_dimensions),
unroll_factor_(unroll_factor) {}
ParallelLoopEmitter::ParallelLoopEmitter(
const llvm_ir::ElementGenerator& target_element_generator,
tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
- const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+ const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b,
int unroll_factor)
- : LoopEmitter(target_element_generator, target_arrays, ir_builder),
+ : LoopEmitter(target_element_generator, target_arrays, b),
launch_dimensions_(launch_dimensions),
unroll_factor_(unroll_factor) {}
ParallelLoopEmitter::ParallelLoopEmitter(
const llvm_ir::ElementGenerator& target_element_generator,
const llvm_ir::IrArray& target_array,
- const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+ const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b,
int unroll_factor)
- : LoopEmitter(target_element_generator, target_array, ir_builder),
+ : LoopEmitter(target_element_generator, target_array, b),
launch_dimensions_(launch_dimensions),
unroll_factor_(unroll_factor) {}
@@ -74,29 +74,27 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
CHECK_NE(index_type, nullptr);
std::vector<llvm_ir::IrArray::Index> array_indices;
llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, ir_builder_);
+ llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_);
llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(),
static_cast<llvm::Instruction*>(block_id));
- block_id = ir_builder_->CreateZExtOrTrunc(block_id, index_type, "block_id");
+ block_id = b_->CreateZExtOrTrunc(block_id, index_type, "block_id");
// Per the PTX documentation:
// "It is guaranteed that [...] 0 <= %tid.x < %ntid.x"
//
// %ntid.x is currently specified as 1024.
llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
- llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, ir_builder_);
+ llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_);
llvm_ir::AddRangeMetadata(0, launch_dimensions_.threads_per_block(),
static_cast<llvm::Instruction*>(thread_id));
- thread_id =
- ir_builder_->CreateZExtOrTrunc(thread_id, index_type, "thread_id");
-
- llvm::Value* linear_index_base = ir_builder_->CreateAdd(
- ir_builder_->CreateMul(
- block_id,
- llvm::ConstantInt::get(index_type,
- launch_dimensions_.threads_per_block()),
- "",
- /*HasNUW=*/true, /*HasNSW=*/true),
+ thread_id = b_->CreateZExtOrTrunc(thread_id, index_type, "thread_id");
+
+ llvm::Value* linear_index_base = b_->CreateAdd(
+ b_->CreateMul(block_id,
+ llvm::ConstantInt::get(
+ index_type, launch_dimensions_.threads_per_block()),
+ "",
+ /*HasNUW=*/true, /*HasNSW=*/true),
thread_id, "linear_index", /*HasNUW=*/true, /*HasNSW=*/true);
// Add an @llvm.assume(linear_index < threads_per_block * num_blocks).
@@ -109,41 +107,41 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
// conditions in the same basic block as their operands.
llvm_ir::EmitCallToIntrinsic(
llvm::Intrinsic::assume,
- {ir_builder_->CreateICmpULT(
+ {b_->CreateICmpULT(
linear_index_base,
llvm::ConstantInt::get(index_type,
launch_dimensions_.threads_per_block() *
launch_dimensions_.block_count()),
"linear_index_in_range")},
- {}, ir_builder_);
+ {}, b_);
if (unroll_factor_ > 1) {
- linear_index_base = ir_builder_->CreateMul(
+ linear_index_base = b_->CreateMul(
linear_index_base, llvm::ConstantInt::get(index_type, unroll_factor_),
"linear_index_base", /*HasNUW=*/true, /*HasNSW=*/true);
}
- array_indices.emplace_back(linear_index_base, shape_, ir_builder_);
+ array_indices.emplace_back(linear_index_base, shape_, b_);
for (int i = 1; i < unroll_factor_; ++i) {
- llvm::Value* linear_index = ir_builder_->CreateAdd(
- linear_index_base, llvm::ConstantInt::get(index_type, i),
- "linear_index",
- /*HasNUW=*/true, /*HasNSW=*/true);
- array_indices.emplace_back(linear_index, shape_, ir_builder_);
+ llvm::Value* linear_index =
+ b_->CreateAdd(linear_index_base, llvm::ConstantInt::get(index_type, i),
+ "linear_index",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ array_indices.emplace_back(linear_index, shape_, b_);
}
auto if_in_bounds = llvm_ir::EmitIfThenElse(
- ir_builder_->CreateICmpULT(
+ b_->CreateICmpULT(
linear_index_base,
llvm::ConstantInt::get(index_type, ShapeUtil::ElementsIn(shape_))),
- llvm_ir::IrName(loop_name, "in_bounds"), ir_builder_, false);
+ llvm_ir::IrName(loop_name, "in_bounds"), b_, false);
// Set exit_bb_ to the exit block of the if structure.
exit_bb_ = if_in_bounds.after_block;
CHECK_NE(nullptr, exit_bb_);
// Set IR builder insertion point to the body of the if structure.
- llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
+ llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, b_);
return array_indices;
}
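
The emitted index computation reduces to a few lines of integer arithmetic. A scalar C++ sketch, with plain parameters standing in for the `ctaid.x` and `tid.x` intrinsic reads (the bounds check and `@llvm.assume` are omitted):

    #include <cstdint>
    #include <vector>

    // The linear indices one thread covers, per the emitted IR. block_id and
    // thread_id stand in for the nvvm ctaid.x / tid.x intrinsic reads.
    std::vector<int64_t> ThreadLinearIndices(int64_t block_id, int64_t thread_id,
                                             int64_t threads_per_block,
                                             int unroll_factor) {
      int64_t base = block_id * threads_per_block + thread_id;
      if (unroll_factor > 1) base *= unroll_factor;
      std::vector<int64_t> indices;
      for (int i = 0; i < unroll_factor; ++i) indices.push_back(base + i);
      return indices;  // each index is still bounds-checked against the shape
    }
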
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index 302e1bf1bc..b82a23419d 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -34,13 +34,13 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
// The meanings of other parameters are the same as LoopEmitter.
ParallelLoopEmitter(BodyEmitter body_emitter, const Shape& shape,
const LaunchDimensions& launch_dimensions,
- llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
+ llvm::IRBuilder<>* b, int unroll_factor = 1);
// Constructs a ParallelLoopEmitter from an element generator that generates
// each element of the given target array.
ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
const llvm_ir::IrArray& target_array,
const LaunchDimensions& launch_dimensions,
- llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
+ llvm::IRBuilder<>* b, int unroll_factor = 1);
// Constructs a loop emitter for a loop that generates on element of each of N
// arrays on each iteration.
@@ -50,7 +50,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
ParallelLoopEmitter(
const llvm_ir::ElementGenerator& target_element_generator,
tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
- const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+ const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b,
int unroll_factor = 1);
ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
index e4cfc6999f..0806dd5161 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
@@ -33,13 +33,13 @@ int StreamAssignment::StreamNumberForHlo(const HloInstruction& hlo) const {
}
void StreamAssignment::AssignStreamToHlo(const HloInstruction* hlo,
- int stream_no) {
- CHECK_GE(stream_no, 0);
- if (stream_no >= stream_count_) {
- stream_count_ = stream_no + 1;
+ int stream_num) {
+ CHECK_GE(stream_num, 0);
+ if (stream_num >= stream_count_) {
+ stream_count_ = stream_num + 1;
}
- InsertOrDie(&hlo_to_stream_number_, hlo, stream_no);
- VLOG(2) << "Assign stream #" << stream_no << " to " << hlo->ToString();
+ InsertOrDie(&hlo_to_stream_number_, hlo, stream_num);
+ VLOG(2) << "Assign stream #" << stream_num << " to " << hlo->ToString();
}
namespace {
@@ -51,6 +51,12 @@ bool CanRunConcurrently(const HloInstruction& a, const HloInstruction& b,
return !reachability.IsConnected(&a, &b);
}
+constexpr int kInvalidStreamNum = -1;
+// Returns true iff `stream_num` is a valid stream number.
+inline bool IsStreamNumValid(int stream_num) {
+ return stream_num != kInvalidStreamNum;
+}
+
// Returns which existing stream to assign to `hlo`, or -1 if a stream is not
// needed. `stream_assignment` is the existing stream assignment for all
// instructions topologically before `hlo`. `seen_gemms` contains all GEMMs that
@@ -62,7 +68,7 @@ int ComputeStreamToAssign(
if (hlo.opcode() == HloOpcode::kParameter ||
hlo.opcode() == HloOpcode::kConstant) {
// kParameter and kConstant do not need a thunk.
- return -1;
+ return kInvalidStreamNum;
}
if (hlo.GetModule()
@@ -75,17 +81,17 @@ int ComputeStreamToAssign(
if (!ImplementedAsGemm(hlo)) {
// If `hlo` is not implemented as a GEMM, keep it close to its operands to
// avoid excessive synchronization.
- int stream_no = -1;
+ int stream_num = -1;
for (const auto* operand : hlo.operands()) {
if (stream_assignment.HasStreamAssigned(*operand)) {
- stream_no =
- std::max(stream_no, stream_assignment.StreamNumberForHlo(*operand));
+ stream_num = std::max(stream_num,
+ stream_assignment.StreamNumberForHlo(*operand));
}
}
- if (stream_no == -1) {
- stream_no = 0;
+ if (!IsStreamNumValid(stream_num)) {
+ stream_num = 0;
}
- return stream_no;
+ return stream_num;
}
// Assign different streams to concurrent GEMMs. The code below uses a
@@ -94,17 +100,17 @@ int ComputeStreamToAssign(
// `hlo` a different stream.
std::set<int> forbidden_stream_numbers;
for (const auto* seen_gemm : seen_gemms) {
- int stream_no = stream_assignment.StreamNumberForHlo(*seen_gemm);
- if (!forbidden_stream_numbers.count(stream_no) &&
+ int stream_num = stream_assignment.StreamNumberForHlo(*seen_gemm);
+ if (!forbidden_stream_numbers.count(stream_num) &&
CanRunConcurrently(*seen_gemm, hlo, reachability)) {
- forbidden_stream_numbers.insert(stream_no);
+ forbidden_stream_numbers.insert(stream_num);
}
}
- for (int stream_no = 0; stream_no < stream_assignment.StreamCount();
- ++stream_no) {
- if (!forbidden_stream_numbers.count(stream_no)) {
- return stream_no;
+ for (int stream_num = 0; stream_num < stream_assignment.StreamCount();
+ ++stream_num) {
+ if (!forbidden_stream_numbers.count(stream_num)) {
+ return stream_num;
}
}
return stream_assignment.StreamCount();
@@ -118,11 +124,27 @@ std::unique_ptr<StreamAssignment> AssignStreams(const HloModule& module) {
std::unique_ptr<HloReachabilityMap> reachability =
computation.ComputeReachability();
std::vector<const HloInstruction*> seen_gemms;
+ // The execution of different RNG Hlo instructions in the same module updates
+ // a common global variable. To avoid a race condition, we simply assign all
+ // RNG kernels to the same stream to make them run sequentially.
+ //
+ // TODO(b/111791052): If we remove such a common variable, we will need to
+ // clean up the code here.
+ int stream_num_for_rng = kInvalidStreamNum;
for (const auto* hlo : computation.MakeInstructionPostOrder()) {
- int stream_no = ComputeStreamToAssign(*hlo, *stream_assignment,
- *reachability, seen_gemms);
- if (stream_no != -1) {
- stream_assignment->AssignStreamToHlo(hlo, stream_no);
+ // If we ever enable fusion of RNG instructions, we will need to extend this
+ // code to look inside a fused instruction.
+ int stream_num = (hlo->opcode() == HloOpcode::kRng &&
+ IsStreamNumValid(stream_num_for_rng))
+ ? stream_num_for_rng
+ : ComputeStreamToAssign(*hlo, *stream_assignment,
+ *reachability, seen_gemms);
+ if (IsStreamNumValid(stream_num)) {
+ stream_assignment->AssignStreamToHlo(hlo, stream_num);
+ if (hlo->opcode() == HloOpcode::kRng &&
+ !IsStreamNumValid(stream_num_for_rng)) {
+ stream_num_for_rng = stream_num;
+ }
}
if (ImplementedAsGemm(*hlo)) {
seen_gemms.push_back(hlo);
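
The GEMM stream choice is greedy graph coloring: gather the stream numbers of previously seen GEMMs that can run concurrently with `hlo`, then take the lowest stream number outside that set, opening a new stream if none is free. A compact sketch of that final selection step:

    #include <set>

    // Final selection step of ComputeStreamToAssign: the lowest stream not
    // used by any already-seen concurrent GEMM, or a brand-new stream.
    int PickStream(const std::set<int>& forbidden, int stream_count) {
      for (int stream_num = 0; stream_num < stream_count; ++stream_num) {
        if (!forbidden.count(stream_num)) return stream_num;
      }
      return stream_count;  // all existing streams forbidden: open a new one
    }
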
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
index a50ddf6ac6..f313a9f172 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
@@ -20,10 +20,10 @@ limitations under the License.
namespace xla {
namespace gpu {
-using stream_executor::dnn::DataLayout;
-using stream_executor::dnn::DataLayoutString;
-using stream_executor::dnn::FilterLayout;
-using stream_executor::dnn::FilterLayoutString;
+using se::dnn::DataLayout;
+using se::dnn::DataLayoutString;
+using se::dnn::FilterLayout;
+using se::dnn::FilterLayoutString;
StatusOr<std::tuple<Layout, Layout, Layout>>
StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums,
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
index 39a6a38d00..5c5e7f2a56 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
+++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
@@ -17,6 +17,7 @@ limitations under the License.
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_
#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -29,14 +30,13 @@ namespace gpu {
// layouts.
StatusOr<std::tuple<Layout, Layout, Layout>>
StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums,
- stream_executor::dnn::DataLayout input,
- stream_executor::dnn::FilterLayout filter,
- stream_executor::dnn::DataLayout output);
+ se::dnn::DataLayout input,
+ se::dnn::FilterLayout filter,
+ se::dnn::DataLayout output);
// Returns (input, filter, output) StreamExecutor layouts given the XLA layouts.
-StatusOr<std::tuple<stream_executor::dnn::DataLayout,
- stream_executor::dnn::FilterLayout,
- stream_executor::dnn::DataLayout>>
+StatusOr<
+ std::tuple<se::dnn::DataLayout, se::dnn::FilterLayout, se::dnn::DataLayout>>
XlaConvLayoutsToStreamExecutorLayouts(const ConvolutionDimensionNumbers& dnums,
const Layout& input, const Layout& filter,
const Layout& output);
diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD
new file mode 100644
index 0000000000..4fad3f46cf
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD
@@ -0,0 +1,223 @@
+# Description: GPU-specific XLA tests. For example, codegen tests that
+# verify the IR emitted.
+#
+# TODO(jlebar): None of these tests actually use the GPU, so they should not
+# need to run on machines with GPUs present.
+
+licenses(["notice"]) # Apache 2.0
+
+package(default_visibility = [":friends"])
+
+package_group(
+ name = "friends",
+ includes = [
+ "//tensorflow/compiler/xla:friends",
+ ],
+)
+
+# Filegroup used to collect source files for dependency checking.
+filegroup(
+ name = "c_srcs",
+ data = glob([
+ "**/*.cc",
+ "**/*.h",
+ ]),
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+ name = "gpu_codegen_test",
+ testonly = True,
+ srcs = ["gpu_codegen_test.cc"],
+ hdrs = ["gpu_codegen_test.h"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+ "//tensorflow/compiler/xla/service:gpu_plugin",
+ "//tensorflow/compiler/xla/service/gpu:gpu_executable",
+ "//tensorflow/compiler/xla/tests:filecheck",
+ "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
+ "//tensorflow/core:lib",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_copy_test",
+ srcs = ["gpu_copy_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:literal_util",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_ftz_test",
+ srcs = ["gpu_ftz_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_index_test",
+ srcs = ["gpu_index_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla:xla_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service:hlo_module_config",
+ "//tensorflow/compiler/xla/service:hlo_parser",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_infeed_test",
+ srcs = ["infeed_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:test_helpers",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/client:global_data",
+ "//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client/lib:arithmetic",
+ "//tensorflow/compiler/xla/tests:client_library_test_base",
+ "//tensorflow/compiler/xla/tests:literal_test_util",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_kernel_tiling_test",
+ srcs = ["gpu_kernel_tiling_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service:hlo_module_config",
+ "//tensorflow/compiler/xla/service:hlo_parser",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_ldg_test",
+ srcs = ["gpu_ldg_test.cc"],
+ tags = ["requires-gpu-sm35"],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_noalias_test",
+ srcs = ["gpu_noalias_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla:literal",
+ "//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla:util",
+ "//tensorflow/compiler/xla:xla_data_proto",
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_fusion_test",
+ srcs = ["gpu_fusion_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla/service:hlo_module_config",
+ "//tensorflow/compiler/xla/service:hlo_parser",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_unrolling_test",
+ srcs = ["gpu_unrolling_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla/service:hlo_module_config",
+ "//tensorflow/compiler/xla/service:hlo_parser",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
+
+tf_cc_test(
+ name = "gpu_alignment_test",
+ testonly = True,
+ srcs = ["gpu_alignment_test.cc"],
+ tags = [
+ "requires-gpu-sm35",
+ ],
+ deps = [
+ ":gpu_codegen_test",
+ "//tensorflow/compiler/xla/service:gpu_plugin",
+ "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+ "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis",
+ "//tensorflow/compiler/xla/tests:filecheck",
+ "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ ],
+)
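
Despite the TODO at the top of this file, every target is tagged `requires-gpu-sm35`, so in a typical TensorFlow checkout these run on GPU-equipped machines, e.g. via `bazel test --config=cuda //tensorflow/compiler/xla/service/gpu/tests:gpu_copy_test` (a representative invocation; exact flags depend on the local build setup).
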
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc
new file mode 100644
index 0000000000..672c68e59b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuAlignmentTest : public GpuCodegenTest {};
+
+TEST_F(GpuAlignmentTest, Test) {
+ const char* hlo_string = R"(
+HloModule GpuAlignmentTest
+
+ENTRY main {
+ zero = f32[] constant(0)
+ tok = token[] after-all()
+ a = f32[100] parameter(0)
+ b_tup = (f32[200], token[]) infeed(tok)
+ b = f32[200] get-tuple-element(b_tup), index=0
+ a_padded = f32[150] pad(a, zero), padding=0_50
+ b_sliced = f32[150] slice(b), slice={[0:150]}
+ ROOT c = f32[150] add(a_padded, b_sliced)
+}
+)";
+
+ CompileAndVerifyIr(hlo_string, R"(
+CHECK: @fusion(i8* align 64 dereferenceable(600) %alloc0, i8* align 16 dereferenceable(400) %alloc1, i8* align 64 dereferenceable(864) %temp_buf)
+)");
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
new file mode 100644
index 0000000000..4b8415fe91
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+namespace gpu {
+
+std::unique_ptr<HloModule> GpuCodegenTest::CreateNewModuleWithFTZ(bool ftz) {
+ HloModuleConfig config;
+ auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
+ debug_options.set_xla_gpu_ftz(ftz);
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+ // TODO(b/38354253): Change tests to use Parameters instead of Constants.
+ debug_options.add_xla_disable_hlo_passes("constant_folding");
+ config.set_debug_options(debug_options);
+
+ return MakeUnique<HloModule>(TestName(), config);
+}
+
+void GpuCodegenTest::CompileAndVerifyPtx(std::unique_ptr<HloModule> hlo_module,
+ const string& pattern) {
+ std::unique_ptr<Executable> executable =
+ std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie());
+ string ptx_str =
+ std::string(static_cast<GpuExecutable*>(executable.get())->ptx());
+ StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
+ ASSERT_TRUE(filecheck_result.ok());
+ EXPECT_TRUE(filecheck_result.ValueOrDie());
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
new file mode 100644
index 0000000000..e4a3573bab
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_
+
+#include <string>
+
+#include "tensorflow/compiler/xla/tests/llvm_irgen_test_base.h"
+
+namespace xla {
+namespace gpu {
+
+// Tests that verify IR or PTX emitted by the GPU backend is as expected.
+class GpuCodegenTest : public LlvmIrGenTestBase {
+ protected:
+ // Like HloTestBase::CreateNewModule(), with a flag for configuring the ftz
+ // option.
+ std::unique_ptr<HloModule> CreateNewModuleWithFTZ(bool ftz);
+
+ // Compiles the given HLO module to PTX and verifies the PTX matches the given
+ // FileCheck pattern. (See http://llvm.org/docs/CommandGuide/FileCheck.html).
+ void CompileAndVerifyPtx(std::unique_ptr<HloModule> hlo_module,
+ const string& pattern);
+};
+
+} // namespace gpu
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
new file mode 100644
index 0000000000..ce69e058e6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuCopyTest : public GpuCodegenTest {};
+
+// The GPU backend should not emit a copy kernel for the kCopy instruction in
+// this test. Instead, it should generate a CopyThunk which invokes cuMemcpy at
+// runtime.
+TEST_F(GpuCopyTest, UseMemcpy) {
+ HloComputation::Builder builder(TestName());
+
+ std::unique_ptr<Literal> literal =
+ LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+ HloInstruction* constant = builder.AddInstruction(
+ HloInstruction::CreateConstant(std::move(literal)));
+ builder.AddInstruction(HloInstruction::CreateUnary(
+ constant->shape(), HloOpcode::kCopy, constant));
+
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ // There should not be any kernel prefixed "copy".
+ CompileAndVerifyIr(std::move(hlo_module), "; CHECK-NOT: define void @_copy",
+ /*match_optimized_ir=*/false);
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
new file mode 100644
index 0000000000..177b94934c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+
+// Check that the ftz (flush denormals to zero) flag is reflected in PTX as
+// expected.
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuFtzTest : public GpuCodegenTest {
+ public:
+ explicit GpuFtzTest(bool ftz) : ftz_(ftz) {}
+
+ // Creates an HLO module that performs the given binary operation on some
+ // data.
+ std::unique_ptr<HloModule> CreateBinaryOpModule(HloOpcode op) {
+ HloComputation::Builder builder(TestName());
+
+ Shape param_shape = ShapeUtil::MakeShapeWithLayout(
+ F32, /*dimensions=*/{100, 100}, /*minor_to_major=*/{1, 0});
+ HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+ /* parameter_number=*/0, param_shape, "x"));
+ HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+ /* parameter_number=*/1, param_shape, "y"));
+ builder.AddInstruction(HloInstruction::CreateBinary(param_shape, op, x, y));
+
+ auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+ hlo_module->AddEntryComputation(builder.Build());
+ return hlo_module;
+ }
+
+ // Creates an HLO module that performs the given unary operation on some data.
+ std::unique_ptr<HloModule> CreateUnaryOpModule(HloOpcode op) {
+ HloComputation::Builder builder(TestName());
+
+ Shape param_shape = ShapeUtil::MakeShapeWithLayout(
+ F32, /*dimensions=*/{100, 100}, /*minor_to_major=*/{1, 0});
+ HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+ /* parameter_number=*/0, param_shape, "x"));
+ builder.AddInstruction(HloInstruction::CreateUnary(param_shape, op, x));
+
+ auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+ hlo_module->AddEntryComputation(builder.Build());
+ return hlo_module;
+ }
+
+ bool ftz_;
+};
+
+class GpuFtzEnabledTest : public GpuFtzTest {
+ public:
+ GpuFtzEnabledTest() : GpuFtzTest(/*ftz=*/true) {}
+};
+
+class GpuFtzDisabledTest : public GpuFtzTest {
+ public:
+ GpuFtzDisabledTest() : GpuFtzTest(/*ftz=*/false) {}
+};
+
+// Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
+TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
+ CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+ CHECK-NOT: mul.f32
+ CHECK: mul.ftz.f32
+ CHECK-NOT: mul.f32
+ )");
+}
+TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
+ CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+ CHECK-NOT: mul.ftz.f32
+ CHECK: mul.f32
+ CHECK-NOT: mul.ftz.f32
+ )");
+}
+
+// In NVPTX, exp(float) is implemented in libdevice, and consults __nvvm_reflect
+// to determine whether or not ftz is enabled. The implementation uses two
+// calls to ex2.approx. When ftz is on, we get two calls to the ftz version;
+// when ftz is off, we get one call to the ftz version and one call to the
+// regular version.
+TEST_F(GpuFtzEnabledTest, ExpFtz) {
+ CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+ CHECK-NOT: ex2.approx.f32
+ CHECK: ex2.approx.ftz.f32
+ CHECK-NOT: ex2.approx.f32
+ CHECK: ex2.approx.ftz.f32
+ CHECK-NOT: ex2.approx.f32
+ CHECK-NOT: ex2.approx.ftz.f32
+ )");
+}
+
+TEST_F(GpuFtzDisabledTest, ExpFtz) {
+ CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+ CHECK-NOT: ex2.approx.f32
+ CHECK-DAG: ex2.approx.ftz.f32
+ CHECK-DAG: ex2.approx.f32
+ CHECK-NOT: ex2.approx.f32
+ CHECK-NOT: ex2.approx.ftz.f32
+ )");
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
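
Note that the disabled-ftz `ExpFtz` test uses `CHECK-DAG` rather than `CHECK`: per the comment above, libdevice's exp emits one ftz and one non-ftz `ex2.approx` call in that configuration, and FileCheck should not pin down their relative order.
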
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc
new file mode 100644
index 0000000000..674b436a8e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuFusionTest : public GpuCodegenTest {};
+
+TEST_F(GpuFusionTest, FusedReshape) {
+ const char* hlo_text = R"(
+ HloModule test_module
+
+ fused_computation {
+ p0.param_0 = f32[4,1,1]{2,1,0} parameter(0)
+ p1.param_1 = f32[4,1]{1,0} parameter(1)
+ reshape = f32[4,1]{1,0} reshape(p0.param_0)
+ ROOT add = f32[4,1] add(reshape, p1.param_1)
+ }
+
+ ENTRY BroadcastIntoAdd {
+ p0 = f32[4,1,1]{2,1,0} parameter(0)
+ p1 = f32[4,1]{1,0} parameter(1)
+ ROOT fusion = f32[4,1]{1,0} fusion(p0, p1), kind=kLoop,
+ calls=fused_computation
+ }
+)";
+
+ CompileAndVerifyIr(hlo_text,
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: fadd
+; CHECK: }
+ )");
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
new file mode 100644
index 0000000000..e5958165ef
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+// This file tests the index expressions used to reference source tensors. When
+// the destination tensor and source tensor have compatible shapes, the linear
+// index is used to access the source tensor. Otherwise, dimensional indices
+// computed from the linear index are used to access the source tensor.
+
+class GpuIndexTest : public GpuCodegenTest {};
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndex) {
+ HloComputation::Builder builder(TestName());
+
+ auto param_shape = ShapeUtil::MakeShape(F32, {5, 7, 2});
+ HloInstruction* param_x = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, param_shape, "x"));
+ HloInstruction* param_y = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, param_shape, "y"));
+ builder.AddInstruction(HloInstruction::CreateBinary(
+ ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y));
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(builder.Build());
+
+ // Check the optimized IR as the unoptimized IR contains dead udiv and urem.
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-NOT: udiv
+; CHECK-NOT: urem
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshape) {
+ HloModuleConfig config;
+ config.set_debug_options(HloTestBase::GetDebugOptionsForTest());
+ auto module = ParseHloString(R"(
+ HloModule test_module
+
+ ENTRY CompatibleUseLinearIndexWithReshape {
+ x = f32[5,7,2]{2,1,0} parameter(0)
+ y = f32[5,14]{1,0} parameter(1)
+ reshape = f32[5,7,2]{2,1,0} reshape(y)
+ ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, reshape)
+ })",
+ config)
+ .ValueOrDie();
+
+ // Check the optimized IR as the unoptimized IR contains dead udiv and urem.
+ CompileAndVerifyIr(std::move(module),
+ R"(
+; CHECK-NOT: udiv
+; CHECK-NOT: urem
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
+ HloModuleConfig config;
+ config.set_debug_options(HloTestBase::GetDebugOptionsForTest());
+ auto module = ParseHloString(R"(
+ HloModule test_module
+
+ ENTRY CompatibleUseLinearIndexWithReshape {
+ x = f32[5,7,2]{2,1,0} parameter(0)
+ y = f32[14]{0} parameter(1)
+ reshape = f32[7,2]{1,0} reshape(y)
+ broadcast = f32[5,7,2]{2,1,0} broadcast(reshape), dimensions={1,2}
+ ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, broadcast)
+ })",
+ config)
+ .ValueOrDie();
+
+ // Check the optimized IR reuses the linear index by calculating modulo 14.
+ CompileAndVerifyIr(std::move(module),
+ R"(
+; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
+; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)*
+; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
+; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]]
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithSizeOneDimensions) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+ config.set_debug_options(debug_options);
+
+ auto module = ParseHloString(R"(
+ HloModule test_module
+
+ ENTRY CompatibleUseLinearIndexWithSizeOneDimensions {
+ x = f32[1,1024,1,256]{3,2,1,0} parameter(0)
+ ROOT y = f16[1,1024,1,256]{2,3,1,0} convert(x)
+ })",
+ config)
+ .ValueOrDie();
+
+ // Check that the unoptimized IR reuses the linear index.
+ CompileAndVerifyIr(std::move(module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: udiv i32 %[[linear_index:.*]], 262144
+; CHECK: %[[ld_addr:.*]] = getelementptr inbounds float, float* {{.*}}, i32 %[[linear_index]]
+; CHECK: load float, float* %[[ld_addr]]
+; CHECK: %[[st_addr:.*]] = getelementptr inbounds half, half* {{.*}}, i32 %[[linear_index]]
+; CHECK: store half {{.*}}, half* %[[st_addr]]
+ )",
+ /*match_optimized_ir=*/false);
+}
+
+} // namespace gpu
+} // namespace xla
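
When source and destination shapes are not compatible, the emitter derives dimensional indices from the linear index, and the broadcast test above expects that reuse to surface as a single `urem ..., 14`. A row-major delinearization sketch (illustrative only, not the emitter's code):

    #include <array>
    #include <cstdint>

    // Recover row-major dimensional indices from a linear index. For the
    // broadcast test above, the [7,2]-shaped operand needs only linear % 14.
    std::array<int64_t, 3> Delinearize(int64_t linear,
                                       const std::array<int64_t, 3>& dims) {
      std::array<int64_t, 3> index;
      for (int i = 2; i >= 0; --i) {
        index[i] = linear % dims[i];
        linear /= dims[i];
      }
      return index;
    }
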
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
new file mode 100644
index 0000000000..cca35316f0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -0,0 +1,177 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuKernelTilingTest : public GpuCodegenTest {
+ protected:
+  GpuKernelTilingTest() {
+    auto debug_options = HloTestBase::GetDebugOptionsForTest();
+    // Disable layout_assignment to use the preassigned layouts. The pass must
+    // be disabled before the options are copied into config_.
+    debug_options.add_xla_disable_hlo_passes("layout_assignment");
+    config_.set_debug_options(debug_options);
+  }
+ HloModuleConfig config_;
+};
+
+TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
+ const char *const kHloString = R"(
+ HloModule unnested_transpose_1
+
+ ENTRY unnested_transpose_1 {
+ para0 = f16[32,3,64]{2,1,0} parameter(0)
+ ROOT copy1 = f16[32,3,64]{1,0,2} copy(para0)
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @copy
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+
+ // Check that the kernel runs correctly.
+ EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
+ const char *const kHloString = R"(
+ HloModule unnested_transpose_2
+
+ ENTRY unnested_transpose_2 {
+ para0 = f16[2,3,64]{2,1,0} parameter(0)
+ ROOT copy1 = f16[2,3,64]{1,0,2} copy(para0)
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is not generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @copy
+; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
+ const char *const kHloString = R"(
+ HloModule multiple_output_fusion_1
+ fused_computation.1 {
+ param0 = f32[4,5,6,7,8]{4,3,2,1,0} parameter(0)
+ copy = f32[4,5,6,7,8]{2,1,4,3,0} copy(param0)
+ ROOT convert = f16[4,5,6,7,8]{2,1,4,3,0} convert(copy)
+ }
+
+ ENTRY copy_in_fusion_run_without_hlo_passes {
+ para0 = f32[4,5,6,7,8]{4,3,2,1,0} parameter(0)
+ ROOT fusion.1 = f16[4,5,6,7,8]{2,1,4,3,0} fusion(para0), kind=kLoop,
+ calls=fused_computation.1
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+
+ // Check that the kernel runs correctly.
+ EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
+ const char *const kHloString = R"(
+ HloModule multiple_output_fusion_1
+ fused_computation.1 {
+ param0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+ param1 = f16[8,31,31,65]{3,2,1,0} parameter(1)
+ copy0 = f16[8,31,31,65]{2,1,3,0} copy(param0)
+ copy1 = f16[8,31,31,65]{2,1,3,0} copy(param1)
+ ROOT tuple1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+ tuple(copy0, copy1)
+ }
+
+ ENTRY multiple_output_fusion_1 {
+ para0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+ para1 = f16[8,31,31,65]{3,2,1,0} parameter(1)
+ ROOT fusion.1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+ fusion(para0,para1), kind=kLoop, calls=fused_computation.1
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+
+ // Check that the kernel runs correctly.
+ EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest,
+ MultipleOutputFusionWithTwoPossibleTransposesNotTiled) {
+ const char *const kHloString = R"(
+ HloModule multiple_output_fusion_2
+ fused_computation.1 {
+ param0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+ param1 = f16[8,31,31,65]{1,3,2,0} parameter(1)
+ copy2 = f16[8,31,31,65]{2,1,3,0} copy(param0)
+ copy3 = f16[8,31,31,65]{2,1,3,0} copy(param1)
+ ROOT tuple1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+ tuple(copy2, copy3)
+ }
+
+ ENTRY multiple_output_fusion_2 {
+ para0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+ para1 = f16[8,31,31,65]{1,3,2,0} parameter(1)
+ ROOT fusion1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+ fusion(para0,para1), kind=kLoop, calls=fused_computation.1
+ })";
+
+ // Check that a call to llvm.nvvm.barrier0 is not generated.
+ auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: define void @fusion
+; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+ /*match_optimized_ir=*/true);
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
new file mode 100644
index 0000000000..6c9ae7bada
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Tests that we emit ld.global.nc (the PTX instruction corresponding to CUDA's
+// __ldg builtin) for reads of buffers that don't change during a kernel's
+// execution.
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuLdgTest : public GpuCodegenTest {};
+
+// Parameters are never overwritten, so parameter reads should be lowered to
+// ld.global.nc loads.
+TEST_F(GpuLdgTest, LdgForParamRead) {
+ HloComputation::Builder builder(TestName());
+
+ auto shape = ShapeUtil::MakeShape(F32, {2, 2});
+ HloInstruction* param =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+ builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ CompileAndVerifyPtx(std::move(hlo_module), R"(
+ CHECK-NOT: ld.global.f32
+ CHECK: ld.global.nc.f32
+ )");
+}
+
+// Check that reading a buffer produced by a non-parameter HLO also results in
+// ld.global.nc, if that buffer isn't modified within the instruction that reads
+// it.
+TEST_F(GpuLdgTest, LdgForNonParamRead) {
+ HloComputation::Builder builder(TestName());
+
+ auto shape = ShapeUtil::MakeShape(F32, {2, 2});
+ HloInstruction* param =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+ HloInstruction* add = builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+ HloInstruction* square = builder.AddInstruction(
+ HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, add, add));
+ builder.AddInstruction(HloInstruction::CreateTuple({add, square}));
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ CompileAndVerifyPtx(std::move(hlo_module), R"(
+ CHECK: {
+ CHECK-NOT: ld.global.f32
+ CHECK: ld.global.nc.f32
+ CHECK: }
+ )");
+}
+
+// Check that reading a buffer that's modified in-place does not produce
+// ld.global.nc.
+//
+// We do this by creating a reduce that feeds into a sin. We don't currently
+// fuse sin into reduce, and the sin is elementwise, so it reuses its input
+// buffer as its output.
+//
+// It seems like a fair bet that we won't start fusing sin into the output of
+// reduce in the foreseeable future. But if that turns out to be wrong, I give
+// you, future reader, permission to delete this test.
+TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
+ auto hlo_module = CreateNewModule();
+ HloComputation::Builder builder(TestName());
+
+ HloComputation* reduce_computation;
+ {
+ auto embedded_builder = HloComputation::Builder("add");
+ auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+ 0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+ auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+ 1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+ embedded_builder.AddInstruction(
+ HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+ reduce_computation =
+ hlo_module->AddEmbeddedComputation(embedded_builder.Build());
+ }
+
+ auto param_shape = ShapeUtil::MakeShape(F32, {2, 2});
+ auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+ HloInstruction* param = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, param_shape, "x"));
+ HloInstruction* reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+ reduce_shape,
+ builder.AddInstruction(HloInstruction::CreateBinary(
+ param_shape, HloOpcode::kAdd, param, param)),
+ builder.AddInstruction(
+ HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0))),
+ {0}, reduce_computation));
+ builder.AddInstruction(
+ HloInstruction::CreateUnary(reduce_shape, HloOpcode::kSin, reduce));
+
+ std::unique_ptr<HloComputation> computation = builder.Build();
+ hlo_module->AddEntryComputation(std::move(computation));
+
+ CompileAndVerifyPtx(std::move(hlo_module), R"(
+ CHECK-LABEL: .entry sin
+ CHECK: {
+ CHECK-NOT: ld.global.nc.f32
+ CHECK: ld.global.f32
+ CHECK: }
+ )");
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
new file mode 100644
index 0000000000..c42e5704a4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
@@ -0,0 +1,68 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuNoAliasTest : public GpuCodegenTest {};
+
+TEST_F(GpuNoAliasTest, Concat) {
+ HloComputation::Builder builder(TestName());
+
+ auto param_shape = ShapeUtil::MakeShape(F32, {2, 2});
+ HloInstruction* param_x = builder.AddInstruction(
+ HloInstruction::CreateParameter(0, param_shape, "x"));
+ HloInstruction* param_y = builder.AddInstruction(
+ HloInstruction::CreateParameter(1, param_shape, "y"));
+ HloInstruction* concat =
+ builder.AddInstruction(HloInstruction::CreateConcatenate(
+ ShapeUtil::MakeShape(F32, {2, 4}), {param_x, param_y}, 1));
+ builder.AddInstruction(HloInstruction::CreateConcatenate(
+ ShapeUtil::MakeShape(F32, {2, 6}), {concat, param_x}, 1));
+
+ std::unique_ptr<HloComputation> computation = builder.Build();
+
+ auto hlo_module = CreateNewModule();
+ hlo_module->AddEntryComputation(std::move(computation));
+
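+  // The fused concats read %x and %y and write a separate output buffer, so
+  // the parameter loads should carry !noalias metadata and the result stores
+  // the matching !alias.scope entry.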
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK: %[[x_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %x{{.*}}, i32 0
+; CHECK: load float, float* %[[x_gep]], {{.*}}, !noalias ![[param_noalias:.*]]
+; CHECK: %[[y_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %y{{.*}}, i32 0
+; CHECK: load float, float* %[[y_gep]], {{.*}}, !noalias ![[param_noalias]]
+; CHECK: %[[result_ptr:.*]] = bitcast [2 x [6 x float]]* %fusion{{.*}} to float*
+; CHECK: %[[result_gep:.*]] = getelementptr inbounds float, float* %[[result_ptr]]
+; CHECK: store float {{.*}}, float* %[[result_gep]], !alias.scope ![[param_noalias]]
+; CHECK: ![[param_noalias]] = !{![[retval_buffer:.*]]}
+ )",
+ /*match_optimized_ir=*/false);
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
new file mode 100644
index 0000000000..9622936306
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuUnrollingTest : public GpuCodegenTest {};
+
+const char *const kAddModule = R"(
+ HloModule test_module
+
+ fused_computation {
+ p0.param_0 = f32[2,2]{1,0} parameter(0)
+ p1.param_1 = f32[2,2]{1,0} parameter(1)
+ ROOT add = f32[2,2] add(p0.param_0, p1.param_1)
+ }
+
+  ENTRY AddFusion {
+ p0 = f32[2,2]{1,0} parameter(0)
+ p1 = f32[2,2]{1,0} parameter(1)
+ ROOT fusion = f32[2,2]{1,0} fusion(p0, p1), kind=kLoop,
+ calls=fused_computation
+ })";
+
+TEST_F(GpuUnrollingTest, DoNotUnroll) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
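+  // An unroll factor of 1 disables unrolling, so exactly one fadd should
+  // remain in the kernel body.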
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+ config.set_debug_options(debug_options);
+ auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollFourTimes) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
+ // We request a factor of 8, but the computation works on 4 elements, limiting
+ // the maximum unroll factor.
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(8);
+ config.set_debug_options(debug_options);
+ auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollDefaultTimes) {
+ // The default unrolling factor is 4.
+ HloModuleConfig config;
+ config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+ auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: load <4 x float>
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: store <4 x float>
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollUnfusedAdd) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
+ config.set_debug_options(debug_options);
+
+ const char *const kUnfusedAddModule = R"(
+ HloModule test_module
+
+ ENTRY AddFunc {
+ p0 = f32[2,2]{1,0} parameter(0)
+ p1 = f32[2,2]{1,0} parameter(1)
+ ROOT add = f32[2,2]{1,0} add(p0, p1)
+ })";
+ auto hlo_module = ParseHloString(kUnfusedAddModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @add
+; CHECK: load <4 x float>
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: store <4 x float>
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollMultiOutputFusion) {
+ HloModuleConfig config;
+ auto debug_options = HloTestBase::GetDebugOptionsForTest();
+ debug_options.set_xla_gpu_max_kernel_unroll_factor(2);
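+  // With four elements and an unroll factor of 2, each of add and multiply
+  // should be emitted twice, using <2 x float> vector loads and stores.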
+ config.set_debug_options(debug_options);
+
+ const char *const kMultiOutputFusionModule = R"(
+ HloModule test_module
+
+ fused_computation {
+ p0.param_0 = f32[2,2]{1,0} parameter(0)
+ p1.param_1 = f32[2,2]{1,0} parameter(1)
+ add = f32[2,2]{1,0} add(p0.param_0, p1.param_1)
+ mul = f32[2,2]{1,0} multiply(p0.param_0, p1.param_1)
+ ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(add, mul)
+ }
+
+  ENTRY MultiOutputFusion {
+ p0 = f32[2,2]{1,0} parameter(0)
+ p1 = f32[2,2]{1,0} parameter(1)
+ ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p0, p1), kind=kLoop,
+ calls=fused_computation
+ })";
+ auto hlo_module =
+ ParseHloString(kMultiOutputFusionModule, config).ValueOrDie();
+
+ CompileAndVerifyIr(std::move(hlo_module),
+ R"(
+; CHECK-LABEL: @fusion
+; CHECK: load <2 x float>
+; CHECK: load <2 x float>
+; CHECK-NOT: load <2 x float>
+; CHECK: fadd
+; CHECK: fmul
+; CHECK: fadd
+; CHECK: fmul
+; CHECK: store <2 x float>
+; CHECK: store <2 x float>
+; CHECK-NOT: store <2 x float>
+; CHECK-NOT: fadd
+; CHECK-NOT: fmul
+; CHECK: }
+ )",
+ /*match_optimized_ir=*/true);
+}
+
+} // namespace
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
new file mode 100644
index 0000000000..9072b30317
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <memory>
+
+#include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class InfeedTest : public ClientLibraryTestBase {
+ protected:
+  // Transfers the given literal to the infeed interface of the device, and
+  // checks that the data returned by the Infeed HLO matches the literal.
+ void TestInfeedRoundTrip(const Literal& literal) {
+ // TODO(b/30481585) Explicitly reset the Infeed state so that the
+ // test is not affected by the state from the previous tests.
+ ASSERT_IS_OK(client_->TransferToInfeed(literal));
+ XlaBuilder builder(TestName());
+ Infeed(&builder, literal.shape());
+ if (ShapeUtil::IsTuple(literal.shape())) {
+ // TODO(b/30609564): Use ComputeAndCompareLiteral instead.
+ ComputeAndCompareTuple(&builder, literal, {});
+ } else {
+ ComputeAndCompareLiteral(&builder, literal, {});
+ }
+ }
+};
+
+TEST_F(InfeedTest, SingleInfeedR0Bool) {
+ TestInfeedRoundTrip(*LiteralUtil::CreateR0<bool>(true));
+}
+
+TEST_F(InfeedTest, SingleInfeedR1U32) {
+ TestInfeedRoundTrip(*LiteralUtil::CreateR1<uint32>({1, 2, 3}));
+}
+
+TEST_F(InfeedTest, SingleInfeedR2F32) {
+ TestInfeedRoundTrip(*LiteralUtil::CreateR2F32Linspace(0.0, 1.0, 128, 64));
+}
+
+TEST_F(InfeedTest, SingleInfeedR3F32) {
+ TestInfeedRoundTrip(
+ *LiteralUtil::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+ {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}));
+}
+
+TEST_F(InfeedTest, SingleInfeedR3F32DifferentLayout) {
+ const Layout r3_dim0minor = LayoutUtil::MakeLayout({0, 1, 2});
+ const Layout r3_dim0major = LayoutUtil::MakeLayout({2, 1, 0});
+
+ TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout(
+ {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+ {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}},
+ r3_dim0minor));
+
+ TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout(
+ {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+ {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}},
+ r3_dim0major));
+}
+
+TEST_F(InfeedTest, SingleInfeedR4S32) {
+ TestInfeedRoundTrip(*LiteralUtil::CreateR4(
+ {{{{1, -2}, {-4, 5}, {6, 7}}, {{8, 9}, {10, 11}, {12, 13}}},
+ {{{10, 3}, {7, -2}, {3, 6}}, {{2, 5}, {-11, 5}, {-2, -5}}}}));
+}
+
+// Tests that a large infeed can be handled.
+TEST_F(InfeedTest, LargeInfeed) {
+ Array4D<float> array(80, 100, 8, 128);
+ array.FillIota(1.0f);
+ TestInfeedRoundTrip(*LiteralUtil::CreateR4FromArray4D<float>(array));
+}
+
+TEST_F(InfeedTest, SingleInfeedTuple) {
+ TestInfeedRoundTrip(
+ *LiteralUtil::MakeTuple({LiteralUtil::CreateR1<uint32>({1, 2, 3}).get(),
+ LiteralUtil::CreateR0<bool>(false).get()}));
+}
+
+TEST_F(InfeedTest, SingleInfeedEmptyTuple) {
+ TestInfeedRoundTrip(*LiteralUtil::MakeTuple({}));
+}
+
+// Tests that a large tuple infeed can be handled.
+TEST_F(InfeedTest, SingleInfeedLargeTuple) {
+ Array4D<float> array(40, 100, 8, 128);
+ array.FillIota(1.0f);
+ TestInfeedRoundTrip(*LiteralUtil::MakeTuple(
+ {LiteralUtil::CreateR4FromArray4D<float>(array).get(),
+ LiteralUtil::CreateR0<int32>(5).get()}));
+}
+
+} // namespace
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.cc b/tensorflow/compiler/xla/service/gpu/thunk.cc
new file mode 100644
index 0000000000..c78605cebb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/thunk.cc
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+
+namespace xla {
+namespace gpu {
+
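+// Returns a human-readable name for the given thunk kind. The switch has no
+// default case, so the compiler can warn when a new Thunk::Kind is added
+// without a corresponding string here.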
+std::ostream& operator<<(std::ostream& os, Thunk::Kind kind) {
+ switch (kind) {
+ case Thunk::kConditional:
+ return os << "kConditional";
+ case Thunk::kConvolution:
+ return os << "kConvolution";
+ case Thunk::kCopy:
+ return os << "kCopy";
+ case Thunk::kCudnnBatchNormBackward:
+ return os << "kCudnnBatchNormBackward";
+ case Thunk::kCudnnBatchNormForwardInference:
+ return os << "kCudnnBatchNormForwardInference";
+ case Thunk::kCudnnBatchNormForwardTraining:
+ return os << "kCudnnBatchNormForwardTraining";
+ case Thunk::kFft:
+ return os << "kFft";
+ case Thunk::kGemm:
+ return os << "kGemm";
+ case Thunk::kInfeed:
+ return os << "kInfeed";
+ case Thunk::kKernel:
+ return os << "kKernel";
+ case Thunk::kMemset32BitValue:
+ return os << "kMemset32BitValue";
+ case Thunk::kMemzero:
+ return os << "kMemzero";
+ case Thunk::kOutfeed:
+ return os << "kOutfeed";
+ case Thunk::kSequential:
+ return os << "kSequential";
+ case Thunk::kTuple:
+ return os << "kTuple";
+ case Thunk::kWhile:
+ return os << "kWhile";
+ }
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 99a1a0eae9..4df0bb005b 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -41,7 +41,7 @@ class GpuExecutable;
// This is thread-compatible.
class Thunk {
public:
- enum class Kind {
+ enum Kind {
kConditional,
kConvolution,
kCopy,
@@ -111,6 +111,8 @@ class Thunk {
// A sequence of thunks.
using ThunkSequence = std::vector<std::unique_ptr<Thunk>>;
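+// Prints a human-readable name for 'kind'; useful for logging and debugging.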
+std::ostream& operator<<(std::ostream& os, Thunk::Kind kind);
+
} // namespace gpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
index 1315a4183a..d81d87e7dc 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
@@ -57,6 +57,7 @@ Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
while (true) {
// Invoke thunk sequence for while 'condition' computation.
profiler->StartHloComputation();
+ VLOG(3) << "Executing condition computation";
TF_RETURN_IF_ERROR(condition_thunk_sequence_->ExecuteOnStream(
buffer_allocations, stream, profiler));
profiler->FinishHloComputation(hlo_instruction()->while_condition());
@@ -64,6 +65,7 @@ Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
// Copy the result of condition computation and break the loop if 'false'.
bool condition_result;
stream->ThenMemcpy(&condition_result, condition_result_data, sizeof(bool));
+ VLOG(3) << "condition_result = " << condition_result;
Status block_status = stream->BlockHostUntilDone();
if (!block_status.ok()) {
return InternalError(
@@ -78,6 +80,7 @@ Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
// We measure the time of one execution of the while body computation. The
// while body may be executed more than once, the last measurement "wins".
profiler->StartHloComputation();
+ VLOG(3) << "Executing body computation";
// Invoke thunk sequence for while 'body' computation, and pass on
// 'profiler' to measure the timing of the thunks in 'body_thunk_sequence_'.
TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(buffer_allocations,
diff --git a/tensorflow/compiler/xla/service/gpu/xfeed_queue.h b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
index 737c7eb025..dd46ff433b 100644
--- a/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
+++ b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
@@ -17,6 +17,7 @@ limitations under the License.
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_XFEED_QUEUE_H_
#include <deque>
+#include <functional>
#include <vector>
#include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 87abc0e74f..63a8a813cd 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -244,8 +244,9 @@ message BufferAllocationProto {
int64 index = 1;
int64 size = 2;
bool is_thread_local = 3;
- bool is_reusable = 4;
+ bool is_tuple = 11;
bool is_entry_computation_parameter = 5;
+ bool is_constant = 12;
int64 parameter_number = 6;
repeated int64 parameter_shape_index = 10;
bool maybe_live_out = 7;
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
index afb0c20f0c..1fea544730 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
@@ -42,7 +42,7 @@ class HloAliasAnalysis {
static StatusOr<std::unique_ptr<HloAliasAnalysis>> Run(
HloModule* module,
const HloDataflowAnalysis::FusionCanShareBufferFunction&
- fusion_can_share_buffer = nullptr);
+ fusion_can_share_buffer);
string ToString() const;
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index 403d4df6b5..da94ab5346 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -47,7 +47,9 @@ class HloAliasAnalysisTest : public HloTestBase {
// reference to the generated analysis stored in analysis_.
HloAliasAnalysis& RunAnalysis() {
hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
- analysis_ = HloAliasAnalysis::Run(module_.get()).ConsumeValueOrDie();
+ analysis_ = HloAliasAnalysis::Run(module_.get(),
+ /*fusion_can_share_buffer=*/nullptr)
+ .ConsumeValueOrDie();
return *analysis_;
}
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index c49cf7f5db..1f672502f7 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -155,6 +155,10 @@ Status HloCostAnalysis::HandleConstant(const HloInstruction*) {
return Status::OK();
}
+Status HloCostAnalysis::HandleIota(const HloInstruction*) {
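+  // Iota fills its output with sequential values and performs no
+  // floating-point work, so no cost is recorded.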
+ return Status::OK();
+}
+
Status HloCostAnalysis::HandleGetTupleElement(const HloInstruction*) {
// GetTupleElement forwards a pointer and does not touch each element in the
// output.
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 0181138a6d..82d650dc7b 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -52,6 +52,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
Status HandleElementwiseUnary(const HloInstruction* hlo) override;
Status HandleElementwiseBinary(const HloInstruction* hlo) override;
Status HandleConstant(const HloInstruction* constant) override;
+ Status HandleIota(const HloInstruction* iota) override;
Status HandleGetTupleElement(
const HloInstruction* get_tuple_element) override;
Status HandleSelect(const HloInstruction* hlo) override;
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 9fd0363f57..2c854eea18 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -22,8 +22,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/local_service.h"
#include "tensorflow/compiler/xla/service/service.h"
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 76b9c66651..90fbaa37c5 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -239,7 +239,7 @@ TEST_F(HloCseTest, IdenticalInstructions) {
EXPECT_EQ(5, computation->instruction_count());
EXPECT_THAT(tuple, op::Tuple(exp1, exp2, exp3));
- HloCSE cse(/*is_layout_sensitive=*/false);
+ HloCSE cse(/*is_layout_sensitive=*/true);
EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
EXPECT_EQ(3, computation->instruction_count());
@@ -248,6 +248,183 @@ TEST_F(HloCseTest, IdenticalInstructions) {
EXPECT_THAT(tuple, op::Tuple(first_operand, first_operand, first_operand));
}
+// Test two identical while loops with the same inputs
+TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesSameInput) {
+ auto module = ParseHloString(R"(
+ HloModule WhileLoopsIdenticalConditionsAndBodiesSameInput
+
+ %body (param: (f32[], f32[])) -> (f32[], f32[]) {
+ %param = (f32[], f32[]) parameter(0)
+    %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param), index=0
+    %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param), index=1
+    %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
+ ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add)
+ }
+
+ %condition (param.1: (f32[], f32[])) -> pred[] {
+ %param.1 = (f32[], f32[]) parameter(0)
+ ROOT %constant = pred[] constant(false)
+ }
+
+ %condition.1 (param.2: (f32[], f32[])) -> pred[] {
+ %param.2 = (f32[], f32[]) parameter(0)
+ ROOT %constant.1 = pred[] constant(false)
+ }
+
+  ENTRY %WhileLoopsIdenticalConditionsAndBodiesSameInput () -> (f32[], f32[]) {
+    %constant.2 = f32[] constant(1)
+    %constant.3 = f32[] constant(2)
+    %tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3)
+    %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body
+    ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition.1, body=%body
+  }
+ )")
+ .ValueOrDie();
+
+ auto computation = module->entry_computation();
+
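+  // The two while instructions have identical bodies, conditions, and
+  // operands, so CSE should merge them, reducing the count from 5 to 4.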
+ EXPECT_EQ(5, computation->instruction_count());
+  HloCSE cse(/*is_layout_sensitive=*/true);
+ EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
+ EXPECT_EQ(4, computation->instruction_count());
+}
+
+// Test two while loops with the same conditions and inputs, but different
+// bodies
+TEST_F(HloCseTest, WhileLoopsIdenticalConditionsSameInputAndDifferentBodies) {
+ auto module = ParseHloString(R"(
+ HloModule WhileLoopsIdenticalConditionsSameInputAndDifferentBodies
+
+ %body (param: (f32[], f32[])) -> (f32[], f32[]) {
+ %param = (f32[], f32[]) parameter(0)
+    %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param), index=0
+    %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param), index=1
+    %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
+ ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add)
+ }
+
+ %body2 (param.1: (f32[], f32[])) -> (f32[], f32[]) {
+ %param.1 = (f32[], f32[]) parameter(0)
+    %get-tuple-element.2 = f32[] get-tuple-element((f32[], f32[]) %param.1), index=0
+    %get-tuple-element.3 = f32[] get-tuple-element((f32[], f32[]) %param.1), index=1
+    %sub = f32[] subtract(f32[] %get-tuple-element.2, f32[] %get-tuple-element.3)
+    ROOT %tuple.2 = (f32[], f32[]) tuple(f32[] %get-tuple-element.2, f32[] %sub)
+ }
+
+ %condition (param.2: (f32[], f32[])) -> pred[] {
+ %param.2 = (f32[], f32[]) parameter(0)
+ ROOT %constant = pred[] constant(false)
+ }
+
+ %condition.1 (param.3: (f32[], f32[])) -> pred[] {
+ %param.3 = (f32[], f32[]) parameter(0)
+ ROOT %constant.1 = pred[] constant(false)
+ }
+
+  ENTRY %WhileLoopsIdenticalConditionsSameInputAndDifferentBodies () -> (f32[], f32[]) {
+    %constant.2 = f32[] constant(1)
+    %constant.3 = f32[] constant(2)
+    %tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3)
+    %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body
+    ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition.1, body=%body2
+  }
+ )")
+ .ValueOrDie();
+
+ auto computation = module->entry_computation();
+
+ EXPECT_EQ(5, computation->instruction_count());
+  HloCSE cse(/*is_layout_sensitive=*/true);
+ EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
+ EXPECT_EQ(5, computation->instruction_count());
+}
+
+// Test two identical while loops with different inputs
+TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesDifferentInput) {
+ auto module = ParseHloString(R"(
+ HloModule WhileLoopsIdenticalConditionsAndBodiesDifferentInput
+
+ %body (param: (f32[], f32[])) -> (f32[], f32[]) {
+ %param = (f32[], f32[]) parameter(0)
+    %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param), index=0
+    %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param), index=1
+    %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
+ ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add)
+ }
+
+ %condition (param.1: (f32[], f32[])) -> pred[] {
+ %param.1 = (f32[], f32[]) parameter(0)
+ ROOT %constant = pred[] constant(false)
+ }
+
+ %condition.1 (param.2: (f32[], f32[])) -> pred[] {
+ %param.2 = (f32[], f32[]) parameter(0)
+ ROOT %constant.1 = pred[] constant(false)
+ }
+
+  ENTRY %WhileLoopsIdenticalConditionsAndBodiesDifferentInput () -> (f32[], f32[]) {
+    %constant.2 = f32[] constant(1)
+    %constant.3 = f32[] constant(2)
+    %tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3)
+    %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body
+    %constant.4 = f32[] constant(1)
+    %constant.5 = f32[] constant(2)
+    %tuple.2 = (f32[], f32[]) tuple(f32[] %constant.4, f32[] %constant.5)
+    ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.2), condition=%condition.1, body=%body
+  }
+
+ )")
+ .ValueOrDie();
+
+ auto computation = module->entry_computation();
+
+ EXPECT_EQ(8, computation->instruction_count());
+  HloCSE cse(/*is_layout_sensitive=*/true);
+ EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
+ EXPECT_EQ(8, computation->instruction_count());
+}
+
+// Test two while loops with identical bodies and the same inputs, but
+// different conditions
+TEST_F(HloCseTest, WhileLoopsIdenticalBodiesAndInputDifferentConditions) {
+ auto module = ParseHloString(R"(
+    HloModule WhileLoopsIdenticalBodiesAndInputDifferentConditions
+
+ %body (param: (f32[], f32[])) -> (f32[], f32[]) {
+ %param = (f32[], f32[]) parameter(0)
+    %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param), index=0
+    %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param), index=1
+    %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
+ ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add)
+ }
+
+ %condition (param.1: (f32[], f32[])) -> pred[] {
+ %param.1 = (f32[], f32[]) parameter(0)
+ ROOT %constant = pred[] constant(false)
+ }
+
+ %condition.1 (param.2: (f32[], f32[])) -> pred[] {
+ %param.2 = (f32[], f32[]) parameter(0)
+ ROOT %constant.1 = pred[] constant(true)
+ }
+
+  ENTRY %WhileLoopsIdenticalBodiesAndInputDifferentConditions () -> (f32[], f32[]) {
+    %constant.2 = f32[] constant(1)
+    %constant.3 = f32[] constant(2)
+    %tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3)
+    %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body
+    ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1), condition=%condition.1, body=%body
+  })")
+ .ValueOrDie();
+
+ auto computation = module->entry_computation();
+
+ EXPECT_EQ(5, computation->instruction_count());
+  HloCSE cse(/*is_layout_sensitive=*/true);
+ EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
+ EXPECT_EQ(5, computation->instruction_count());
+}
+
TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) {
// Test that two identical instructions with different layouts are *not*
// commoned if the pass is layout sensitive.
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index de1a32d8bd..1abfcb7703 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -1017,19 +1017,17 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
}
if (user->opcode() == HloOpcode::kFusion) {
+ if (fusion_can_share_buffer_ != nullptr) {
+ return fusion_can_share_buffer_(user, operand);
+ }
// Get the parameter associated with 'operand';
HloInstruction* fusion_param =
user->fused_parameter(user->operand_index(operand));
const HloValue& value = GetValueDefinedAt(fusion_param, operand_index);
- if (value.uses().size() != 1) {
- if (MultiDynamicSliceUseShareSameIndices(value.uses())) {
- return true;
- }
- return false;
+ if (MultiDynamicSliceUseShareSameIndices(value.uses())) {
+ return true;
}
- const HloUse& use = value.uses()[0];
-
if (user->fusion_kind() == HloInstruction::FusionKind::kLoop ||
user->fusion_kind() == HloInstruction::FusionKind::kInput) {
if (user->fused_expression_root()->opcode() ==
@@ -1039,13 +1037,17 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
// Returns true iff there is exactly one use of 'operand' at shape index
// 'operand_index', and this singleton use is the fused root at operand
// index 0.
- return use.instruction == user->fused_expression_root() &&
- use.operand_number == 0;
- } else {
- return AreTransitiveUsesElementwiseOrTuple(fusion_param);
+ if (value.uses().size() == 1) {
+ const HloUse& use = value.uses()[0];
+ return use.instruction == user->fused_expression_root() &&
+ use.operand_number == 0;
+ }
+ return false;
}
- } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
- user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
+ return AreTransitiveUsesElementwiseOrTuple(fusion_param);
+ }
+ if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
+ user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
// Output fusion with kAdd fused root.
// Check if one operand of kAdd fused root is kDot or kConvolution.
@@ -1066,11 +1068,12 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
// Returns true iff there is exactly one use of 'operand' at shape index
// 'operand_index', and this singleton use is the fused root (at operand
// index 'other_add_operand_index').
- return use.instruction == user->fused_expression_root() &&
- use.operand_number == other_add_operand_index;
- } else if (fusion_can_share_buffer_ != nullptr &&
- fusion_can_share_buffer_(user, operand)) {
- return true;
+ if (value.uses().size() == 1) {
+ const HloUse& use = value.uses()[0];
+ return use.instruction == user->fused_expression_root() &&
+ use.operand_number == other_add_operand_index;
+ }
+ return false;
}
}
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 5f575b24a1..cba72469ce 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
#include <utility>
#include <vector>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index c0a8ea8bcb..d5b4be7e12 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -1997,6 +1997,30 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
return HandleReducePrecision<ElementwiseT>(reduce_precision);
}
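+
+  // The two overloads below use SFINAE to restrict iota evaluation to the
+  // supported element types (float, int32, uint32); the virtual override
+  // dispatches on ReturnT, and any other type yields an InvalidArgument.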
+ template <typename NativeT,
+ typename std::enable_if<
+ std::is_same<NativeT, float>::value ||
+ std::is_same<NativeT, int32>::value ||
+ std::is_same<NativeT, uint32>::value>::type* = nullptr>
+ Status HandleIota(HloInstruction* iota) {
+ auto result = MakeUnique<Literal>(iota->shape());
+ auto data = result->data<ReturnT>();
+ std::iota(data.begin(), data.end(), 0);
+ parent_->evaluated_[iota] = std::move(result);
+ return Status::OK();
+  }
+
+  template <typename NativeT,
+ typename std::enable_if<
+ !(std::is_same<NativeT, float>::value ||
+ std::is_same<NativeT, int32>::value ||
+ std::is_same<NativeT, uint32>::value)>::type* = nullptr>
+ Status HandleIota(HloInstruction* iota) {
+ return InvalidArgument("Unsupported type for iota");
+  }
+
+  Status HandleIota(HloInstruction* iota) override {
+ return HandleIota<ReturnT>(iota);
+ }
+
private:
// Creates a vector of multipliers which can be used to create a linear index
// into shape.
@@ -2054,10 +2078,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
start_indices_typed.end());
// Clamp the start indices so the slice is in-bounds w.r.t the operand.
-
- // TODO(b/74360564): This is implementation defined behavior, but is
- // currently respected by all implementations. Change this if we ever decide
- // to officially document different behavior.
for (int64 i = 0; i < start.size(); ++i) {
start[i] = std::min<int64>(
std::max(int64{0}, start[i]),
@@ -2091,10 +2111,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
start_indices_typed.end());
// Clamp the update start indices so the slice is in-bounds w.r.t the
// operand.
-
- // TODO(b/74360564): This is implementation defined behavior, but is
- // currently respected by all implementations. Change this if we ever decide
- // to oficially document different behavior.
for (int64 i = 0; i < rank; ++i) {
start[i] = std::min<int64>(
std::max<int64>(0, start[i]),
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 57cf34d7de..fd5085bed2 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -948,6 +948,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
case HloOpcode::kGe:
case HloOpcode::kGt:
case HloOpcode::kImag:
+ case HloOpcode::kIota:
case HloOpcode::kIsFinite:
case HloOpcode::kLe:
case HloOpcode::kLog:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index ae30d2ad8d..8b9bdd2f46 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -463,6 +463,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
return MakeUnique<HloConstantInstruction>(std::move(literal));
}
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateIota(
+ const Shape& shape) {
+ return WrapUnique(new HloInstruction(HloOpcode::kIota, shape));
+}
+
/* static */ std::unique_ptr<HloInstruction>
HloInstruction::CreateGetTupleElement(const Shape& shape,
HloInstruction* operand, int64 index) {
@@ -1119,6 +1124,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
case HloOpcode::kDynamicSlice:
case HloOpcode::kSort:
case HloOpcode::kGather:
+ case HloOpcode::kIota:
clone = CloneWithNewOperandsImpl(shape, new_operands, context);
break;
// Unary ops.
@@ -1516,8 +1522,7 @@ bool HloInstruction::IdenticalSlowPath(
case HloOpcode::kTupleSelect:
return true;
- // These opcodes have complex or special behavior so just return false.
- case HloOpcode::kWhile:
+ // This opcode has complex or special behavior so just return false.
case HloOpcode::kAfterAll:
return false;
@@ -1533,6 +1538,14 @@ bool HloInstruction::IdenticalSlowPath(
return eq_computations(true_computation(), other.true_computation()) &&
eq_computations(false_computation(), other.false_computation());
+    case HloOpcode::kWhile:
+      return eq_computations(while_body(), other.while_body()) &&
+             eq_computations(while_condition(), other.while_condition());
+
case HloOpcode::kDomain:
return operand_side_metadata().Matches(other.operand_side_metadata()) &&
user_side_metadata().Matches(other.user_side_metadata());
@@ -1556,6 +1569,7 @@ bool HloInstruction::IdenticalSlowPath(
case HloOpcode::kMap:
case HloOpcode::kSlice:
case HloOpcode::kConstant:
+ case HloOpcode::kIota:
case HloOpcode::kTrace:
case HloOpcode::kFusion:
case HloOpcode::kRng:
@@ -1576,6 +1590,7 @@ bool HloInstruction::IdenticalSlowPath(
LOG(FATAL) << "Base class impl called for opcode with subclass: "
<< opcode();
}
+ return false;
}
void HloInstruction::RemoveUser(HloInstruction* user) {
@@ -2300,6 +2315,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
return visitor->HandleDomain(this);
case HloOpcode::kAfterAll:
return visitor->HandleAfterAll(this);
+ case HloOpcode::kIota:
+ return visitor->HandleIota(this);
// These opcodes are not handled here.
case HloOpcode::kTrace:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index c6faa69a78..30bff286c2 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -346,6 +346,9 @@ class HloInstruction {
static std::unique_ptr<HloInstruction> CreateConstant(
std::unique_ptr<Literal> literal);
+ // Creates an Iota instruction.
+ static std::unique_ptr<HloInstruction> CreateIota(const Shape& shape);
+
// Creates a get tuple element instruction.
static std::unique_ptr<HloInstruction> CreateGetTupleElement(
const Shape& shape, HloInstruction* operand, int64 index);
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 39e12c4815..59e9a5a94a 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -87,6 +87,7 @@ namespace xla {
V(kHostCompute, "host-compute") \
V(kImag, "imag") \
V(kInfeed, "infeed") \
+ V(kIota, "iota") \
V(kIsFinite, "is-finite") \
V(kLe, "less-than-or-equal-to", kHloOpcodeIsComparison) \
V(kLog, "log") \
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 496eca0739..d71d3c8170 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -492,6 +492,14 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
HloInstruction::CreateConstant(std::move(literal)));
break;
}
+ case HloOpcode::kIota: {
+ if (!ParseOperands(&operands, /*expected_size=*/0) ||
+ !ParseAttributes(attrs)) {
+ return false;
+ }
+ instruction = builder->AddInstruction(HloInstruction::CreateIota(shape));
+ break;
+ }
// Unary ops.
case HloOpcode::kAbs:
case HloOpcode::kRoundNearestAfz:
@@ -1124,13 +1132,24 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
}
case HloOpcode::kCustomCall: {
optional<string> custom_call_target;
+ optional<Window> window;
+ optional<ConvolutionDimensionNumbers> dnums;
attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString,
&custom_call_target};
+ attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
+ attrs["dim_labels"] = {/*required=*/false,
+ AttrTy::kConvolutionDimensionNumbers, &dnums};
if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
return false;
}
instruction = builder->AddInstruction(HloInstruction::CreateCustomCall(
shape, operands, *custom_call_target));
+ if (window.has_value()) {
+ instruction->set_window(*window);
+ }
+ if (dnums.has_value()) {
+ instruction->set_convolution_dimension_numbers(*dnums);
+ }
break;
}
case HloOpcode::kHostCompute: {
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 6ba34cf22a..1c08c51220 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -1005,6 +1005,28 @@ ENTRY CrossReplicaSumWithSubgroups {
}
)"
+},
+// Iota
+{
+"Iota",
+R"(HloModule iota
+
+ENTRY Iota {
+ ROOT iota = f32[100]{0} iota()
+}
+
+)"
+},
+// custom-call with window and dim_labels
+{
+"CustomCallWithWindowAndDimLabels",
+R"(HloModule CustomCallWithWindowAndDimLabels
+
+ENTRY Computation {
+ ROOT r = f32[100]{0} custom-call(), window={size=2x2}, dim_labels=b01f_01io->b01f, custom_call_target="target"
+}
+
+)"
}
});
// clang-format on
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 59a8800a7d..cf0be30c7a 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -1203,7 +1203,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
StatusOr<bool> HloRematerialization::Run(
HloModule* module, SequentialHloOrdering::HloModuleSequence* sequence,
int64 memory_limit_bytes, RematerializationSizes* sizes,
- bool run_copy_elision) {
+ CopyInsertion* copy_insertion) {
// The sequence is constructed entirely by this method.
TF_RET_CHECK(sequence->empty());
@@ -1238,13 +1238,14 @@ StatusOr<bool> HloRematerialization::Run(
return size_function_(buffer.shape());
},
scheduler_algorithm_));
- if (run_copy_elision) {
+ if (copy_insertion) {
// We run a separate pass of copy elision here because the sequential
// ordering from the HLO schedule allows for more copies to be eliminated.
// TODO(b/80249101): Instead of a separate copy elision pass, use the
// ordering from the HLO schedule directly for copy insertion.
SequentialHloOrdering ordering(module, *sequence);
- TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(ordering, module));
+ TF_RETURN_IF_ERROR(
+ copy_insertion->RemoveUnnecessaryCopies(ordering, module));
}
// Compute peak memory usage of all computations in the module called in a
@@ -1349,10 +1350,10 @@ StatusOr<bool> HloRematerialization::Run(
int64 memory_limit_bytes, HloModule* hlo_module,
MemorySchedulerAlgorithm scheduler_algorithm,
SequentialHloOrdering::HloModuleSequence* sequence,
- RematerializationSizes* sizes, bool run_copy_elision) {
+ RematerializationSizes* sizes, CopyInsertion* copy_insertion) {
HloRematerialization remat(scheduler_algorithm, size_function);
return remat.Run(hlo_module, sequence, memory_limit_bytes, sizes,
- run_copy_elision);
+ copy_insertion);
}
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 59b4cf5dcc..2ec004350a 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -17,6 +17,7 @@
#include "tensorflow/compiler/xla/service/buffer_liveness.h"
#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/copy_insertion.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -57,8 +58,9 @@ class HloRematerialization {
// sizes: Optional outparam that indicates the peak memory usage of the HLO
// module before/after rematerialization.
//
- // run_copy_elision: Enable copy elision. This pass is used to eliminate
- // copies that were inserted before HLO scheduling.
+ // copy_insertion: If non-null, run copy elision after scheduling. This
+ // pass is used to eliminate copies that were inserted by copy insertion
+ // before HLO scheduling.
//
// TODO(b/80249101): Remove the 'copy_insertion' parameter when copy
// insertion is integrated with HLO scheduling.
@@ -74,7 +76,7 @@ class HloRematerialization {
const ShapeSizeFunction& size_function, int64 memory_limit_bytes,
HloModule* hlo_module, MemorySchedulerAlgorithm scheduler_algorithm,
SequentialHloOrdering::HloModuleSequence* sequence,
- RematerializationSizes* sizes, bool run_copy_elision = true);
+ RematerializationSizes* sizes, CopyInsertion* copy_insertion = nullptr);
protected:
HloRematerialization(MemorySchedulerAlgorithm scheduler_algorithm,
@@ -90,7 +92,7 @@ class HloRematerialization {
StatusOr<bool> Run(HloModule* module,
SequentialHloOrdering::HloModuleSequence* sequence,
int64 memory_limit, RematerializationSizes* sizes,
- bool run_copy_elision);
+ CopyInsertion* copy_insertion);
// Rematerializes instructions within the given computation. 'order' is the
// order in which the computation's instructions will be emitted in the
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index cd131147e6..ac8c97d380 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -147,7 +147,7 @@ class HloRematerializationTest : public HloTestBase {
TF_EXPECT_OK(verifier().Run(module).status());
return HloRematerialization::RematerializeAndSchedule(
ByteSizeOf, memory_limit_bytes, module, DefaultMemoryScheduler,
- sequence, /*sizes=*/nullptr, /*run_copy_elision=*/false);
+ sequence, /*sizes=*/nullptr);
}
// Various shapes used in the canned computations.
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 4f0569f405..b2725e2918 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -180,8 +180,12 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
CreateExecutable(std::move(module), run_hlo_passes));
- return executable->ExecuteOnStreamWrapper(&service_run_options,
- /*profile=*/profile, arguments);
+ TF_ASSIGN_OR_RETURN(
+ ScopedShapedBuffer retval,
+ executable->ExecuteOnStreamWrapper(&service_run_options,
+ /*profile=*/profile, arguments));
+ TF_RETURN_IF_ERROR(stream.BlockHostUntilDone());
+ return std::move(retval);
}
StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
@@ -309,6 +313,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
std::vector<std::unique_ptr<Literal>> exec_results;
for (int64 i = 0; i < options.num_replicas; ++i) {
+ TF_RETURN_IF_ERROR(streams[i]->BlockHostUntilDone());
TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
backend().transfer_manager()->TransferLiteralFromDevice(
streams[i].get(), results[i]));
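
Both added BlockHostUntilDone calls close the same gap: execution is enqueued on a stream and may still be in flight when the call returns, so the host has to synchronize before reading device buffers. The pattern, distilled into a sketch (names mirror the hunks above, not the full runner code):

    // Enqueue, synchronize, then read back.
    TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
                        executable->ExecuteOnStreamWrapper(
                            &service_run_options, /*profile=*/nullptr, args));
    TF_RETURN_IF_ERROR(stream.BlockHostUntilDone());  // device work is done
    // Only now is TransferLiteralFromDevice on `result` safe.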
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 6a32093b6e..25fa319faf 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -119,7 +119,7 @@ Status CheckIsTokenOperand(const HloInstruction* instruction,
const HloInstruction* token = instruction->operand(operand_no);
if (!ShapeUtil::Equal(token->shape(), ShapeUtil::MakeTokenShape())) {
return InternalError(
- "Expected operand %lld to be token-shaped, actual shape is"
+ "Expected operand %lld to be token-shaped, actual shape is "
"%s:\n%s",
operand_no, ShapeUtil::HumanString(token->shape()).c_str(),
instruction->ToString().c_str());
@@ -210,6 +210,12 @@ Status ShapeVerifier::HandleConstant(HloInstruction* constant) {
return CheckShape(constant, constant->literal().shape());
}
+Status ShapeVerifier::HandleIota(HloInstruction* iota) {
+ return ShapeUtil::Rank(iota->shape()) == 1
+ ? Status::OK()
+ : InternalError("Iota only supports arrays of rank 1.");
+}
+
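
A quick illustration of what the new check admits and rejects, using hypothetical shapes:

    Shape ok = ShapeUtil::MakeShape(F32, {100});      // rank 1: Status::OK()
    Shape bad = ShapeUtil::MakeShape(F32, {10, 10});  // rank 2: InternalError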
Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) {
return CheckShape(get_tuple_element,
ShapeInference::InferGetTupleElementShape(
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 810c66cf02..79f7aa9f4c 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -37,6 +37,7 @@ class ShapeVerifier : public DfsHloVisitor {
Status HandleSelect(HloInstruction* select) override;
Status HandleTupleSelect(HloInstruction* tuple_select) override;
Status HandleConcatenate(HloInstruction* concatenate) override;
+ Status HandleIota(HloInstruction* iota) override;
Status HandleConvert(HloInstruction* convert) override;
Status HandleBitcastConvert(HloInstruction* convert) override;
Status HandleCopy(HloInstruction* copy) override;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index da91262130..af07370135 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -73,6 +73,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
case HloOpcode::kGt:
case HloOpcode::kImag:
case HloOpcode::kInfeed:
+ case HloOpcode::kIota:
case HloOpcode::kIsFinite:
case HloOpcode::kLe:
case HloOpcode::kLt:
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 462be543bc..cdd3daf73b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -188,7 +188,10 @@ cc_library(
":ir_array",
":llvm_loop",
":llvm_util",
+ ":loop_emitter",
"//tensorflow/compiler/xla:shape_util",
+ "//tensorflow/compiler/xla/service/gpu:parallel_loop_emitter",
+ "//tensorflow/compiler/xla/service/gpu:partition_assignment",
"//tensorflow/core:lib",
"@llvm//:core",
],
@@ -220,3 +223,22 @@ cc_library(
"@llvm//:core",
],
)
+
+cc_library(
+ name = "buffer_assignment_util",
+ srcs = ["buffer_assignment_util.cc"],
+ hdrs = ["buffer_assignment_util.h"],
+ deps = [
+ "//tensorflow/compiler/xla/service:buffer_assignment",
+ ],
+)
+
+cc_library(
+ name = "math_ops",
+ srcs = ["math_ops.cc"],
+ hdrs = ["math_ops.h"],
+ deps = [
+ ":llvm_util",
+ "@llvm//:core",
+ ],
+)
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index 93a8c130e1..e5370eca56 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -28,8 +28,7 @@ namespace llvm_ir {
// Sentry allocation used to represent parameters of the entry computation in
// alias_scope_metadata_ and noalias_metadata_.
static const BufferAllocation* kParameterAllocation = new BufferAllocation(
- /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false,
- LogicalBuffer::Color(0));
+ /*index=*/-1, /*size=*/0, LogicalBuffer::Color(0));
void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo,
llvm_ir::IrArray* array,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
index 2552ff4a6a..941d940684 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
@@ -58,7 +58,7 @@ ENTRY while3 {
CompileAndVerifyIr(hlo_string, R"(
; CHECK-LABEL: @body(i8* align 4 dereferenceable(4) %retval
; CHECK: %[[add_result:.*]] = fadd fast float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]]
-; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], !alias.scope ![[alias_scope_md_for_store:.*]]
+; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], !alias.scope ![[alias_scope_md_for_store:[0-9]+]]
;
; CHECK-LABEL: @condition(i8* align 1 dereferenceable(1) %fusion, i8* noalias %run_options, i8** noalias %params
; CHECK: %[[cond_state_buf_ptr:.*]] = getelementptr inbounds i8*, i8** %params, i64 0
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
new file mode 100644
index 0000000000..4eb5d9fb47
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
+
+namespace xla {
+namespace llvm_ir {
+static const HloInstruction& InstrForConstantBufferAllocation(
+ const BufferAllocation& allocation) {
+ CHECK(allocation.is_constant());
+ HloInstruction* const_instr = nullptr;
+ for (const auto& buffer_offset_pair : allocation.assigned_buffers()) {
+ const LogicalBuffer* buffer = buffer_offset_pair.first;
+ // BufferAssignment may have assigned non-constant instructions to this
+ // allocation too, so we can't CHECK this condition. E.g. for
+ //
+ // while(init = constant, body = identity, cond = ...)
+ //
+ // the LogicalBuffer for the kWhile instruction will have the same
+ // BufferAllocation as the LogicalBuffer for the (init) constant.
+ if (buffer->instruction()->opcode() == HloOpcode::kConstant) {
+ CHECK_EQ(const_instr, nullptr)
+ << const_instr->ToString() << " " << buffer->ToString();
+ const_instr = buffer->instruction();
+ }
+ }
+ CHECK_NE(const_instr, nullptr);
+ return *const_instr;
+}
+
+string ConstantBufferAllocationToGlobalName(
+ const BufferAllocation& allocation) {
+ string instr_name = InstrForConstantBufferAllocation(allocation).name();
+ for (char& c : instr_name) {
+ if (c == '.') {
+ c = '_';
+ }
+ }
+ return tensorflow::strings::StrCat("buffer_for_", instr_name);
+}
+
+const Literal& LiteralForConstantAllocation(
+ const BufferAllocation& allocation) {
+ return InstrForConstantBufferAllocation(allocation).literal();
+}
+} // namespace llvm_ir
+} // namespace xla
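
The mangling exists because HLO instruction names routinely contain dots (e.g. "constant.42"); those become underscores before the "buffer_for_" prefix is attached. An illustrative call against a hypothetical constant allocation:

    // For an allocation whose constant instruction is named "constant.42":
    string global_name = ConstantBufferAllocationToGlobalName(allocation);
    // global_name == "buffer_for_constant_42"
    const Literal& value = LiteralForConstantAllocation(allocation);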
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
new file mode 100644
index 0000000000..bfb6eecb87
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_BUFFER_ASSIGNMENT_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_BUFFER_ASSIGNMENT_UTIL_H_
+
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+
+namespace xla {
+namespace llvm_ir {
+// In XLA:GPU we map constant buffer allocations to globals in the generated
+// LLVM IR. This function gives us the name of the global variable a constant
+// buffer is mapped to. Not used on XLA:CPU.
+string ConstantBufferAllocationToGlobalName(const BufferAllocation& allocation);
+
+// Returns the Literal corresponding to `allocation`, which must be a constant
+// allocation.
+const Literal& LiteralForConstantAllocation(const BufferAllocation& allocation);
+} // namespace llvm_ir
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_BUFFER_ASSIGNMENT_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
index 7048fcfdc9..27fbb11e2e 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -40,14 +40,14 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
const Shape& update_shape, const ElementGenerator& start_indices_generator,
bool is_signed, ElementGenerator update_array_generator,
const IrArray& output_array, const gpu::LaunchDimensions* launch_dimensions,
- tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder) {
+ tensorflow::StringPiece name, llvm::IRBuilder<>* b) {
const Shape& output_shape = output_array.GetShape();
// Read start indices from start_indices_generator.
const int64 rank = ShapeUtil::Rank(output_shape);
- IrArray::Index start_index(ir_builder->getInt64Ty(), rank);
+ IrArray::Index start_index(b->getInt64Ty(), rank);
for (int64 i = 0; i < rank; ++i) {
- IrArray::Index dim_index({ir_builder->getInt64(i)});
+ IrArray::Index dim_index({b->getInt64(i)});
TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index));
llvm::Value* output_dim_size = llvm::ConstantInt::get(
start_index[i]->getType(), output_shape.dimensions(i));
@@ -56,24 +56,19 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
// Clamp the start index so that the update region fits in the operand.
// start_index = clamp(start_index, 0, output_dim_size - update_dim_size)
-
- // TODO(b/74360564): This is implementation defined behavior, but is
- // currently respected by all implementations. Change this if we ever decide
- // to officially document different behavior.
- llvm::Value* max_bound =
- ir_builder->CreateSub(output_dim_size, update_dim_size);
+ llvm::Value* max_bound = b->CreateSub(output_dim_size, update_dim_size);
llvm::Value* zero = llvm::ConstantInt::get(start_index[i]->getType(), 0);
- start_index[i] = ir_builder->CreateSelect(
- ir_builder->CreateICmp(
- is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
- zero, start_index[i]),
- zero, start_index[i]);
-
- start_index[i] = ir_builder->CreateSelect(
- ir_builder->CreateICmp(
- is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
- max_bound, start_index[i]),
- max_bound, start_index[i]);
+ start_index[i] =
+ b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE
+ : llvm::ICmpInst::ICMP_UGE,
+ zero, start_index[i]),
+ zero, start_index[i]);
+
+ start_index[i] =
+ b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE
+ : llvm::ICmpInst::ICMP_ULE,
+ max_bound, start_index[i]),
+ max_bound, start_index[i]);
}
auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status {
@@ -84,31 +79,30 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
//
IrArray::Index output_index(start_index.GetType(), rank);
for (int64 i = 0; i < rank; ++i) {
- llvm::Value* start_index0 = ir_builder->CreateSExtOrBitCast(
- start_index[i], update_index[i]->getType());
- output_index[i] = ir_builder->CreateAdd(start_index0, update_index[i]);
+ llvm::Value* start_index0 =
+ b->CreateSExtOrBitCast(start_index[i], update_index[i]->getType());
+ output_index[i] = b->CreateAdd(start_index0, update_index[i]);
}
// Do output[output_index] = update[update_index].
TF_ASSIGN_OR_RETURN(llvm::Value * update_data,
update_array_generator(update_index));
- output_array.EmitWriteArrayElement(output_index, update_data, ir_builder);
+ output_array.EmitWriteArrayElement(output_index, update_data, b);
return Status::OK();
};
if (launch_dimensions != nullptr) {
return gpu::ParallelLoopEmitter(loop_body_emitter, update_shape,
- *launch_dimensions, ir_builder)
+ *launch_dimensions, b)
.EmitLoop(name);
}
- return LoopEmitter(loop_body_emitter, update_shape, ir_builder)
- .EmitLoop(name);
+ return LoopEmitter(loop_body_emitter, update_shape, b).EmitLoop(name);
}
Status EmitDynamicUpdateSliceInPlace(
tensorflow::gtl::ArraySlice<IrArray> operand_arrays,
const IrArray& output_array, tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
VLOG(2) << "EmitDynamicUpdateSliceInPlace for " << name;
// No need to use operand_arrays[0], the input array of the
@@ -119,16 +113,16 @@ Status EmitDynamicUpdateSliceInPlace(
Shape update_shape = update_array.GetShape();
ElementGenerator start_indices_generator = [&](const IrArray::Index& index) {
- return start_indices_array.EmitReadArrayElement(index, ir_builder);
+ return start_indices_array.EmitReadArrayElement(index, b);
};
ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
- return update_array.EmitReadArrayElement(index, ir_builder);
+ return update_array.EmitReadArrayElement(index, b);
};
bool is_signed = ShapeUtil::ElementIsSigned(start_indices_array.GetShape());
return EmitDynamicUpdateSliceInPlaceImpl(
update_shape, start_indices_generator, is_signed, update_array_generator,
- output_array, /*launch_dimensions=*/nullptr, name, ir_builder);
+ output_array, /*launch_dimensions=*/nullptr, name, b);
}
// Shared implementation for EmitFusedDynamicUpdateSliceInPlace and
@@ -139,8 +133,7 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
HloInstruction* fusion,
tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
- const gpu::LaunchDimensions* launch_dimensions,
- llvm::IRBuilder<>* ir_builder) {
+ const gpu::LaunchDimensions* launch_dimensions, llvm::IRBuilder<>* b) {
CHECK_EQ(fusion->opcode(), HloOpcode::kFusion);
VLOG(2) << "EmitFusedDynamicUpdateSliceInPlace for "
<< fusion->ToShortString();
@@ -177,28 +170,27 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
bool is_signed = ShapeUtil::ElementIsSigned(start_indices->shape());
return EmitDynamicUpdateSliceInPlaceImpl(
update_shape, start_indices_generator, is_signed, update_array_generator,
- fusion_output_array, launch_dimensions, IrName(fusion), ir_builder);
+ fusion_output_array, launch_dimensions, IrName(fusion), b);
}
Status EmitFusedDynamicUpdateSliceInPlace(
HloInstruction* fusion,
tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
return EmitFusedDynamicUpdateSliceInPlaceImpl(
fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
- /*launch_dimensions=*/nullptr, ir_builder);
+ /*launch_dimensions=*/nullptr, b);
}
Status EmitParallelFusedDynamicUpdateSliceInPlace(
HloInstruction* fusion,
tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
- const gpu::LaunchDimensions& launch_dimensions,
- llvm::IRBuilder<>* ir_builder) {
+ const gpu::LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b) {
return EmitFusedDynamicUpdateSliceInPlaceImpl(
fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
- &launch_dimensions, ir_builder);
+ &launch_dimensions, b);
}
} // namespace llvm_ir
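
The two selects per dimension implement a clamp: the first bounds the start index below by zero, the second bounds it above by output size minus update size. A scalar model of the signed case (the unsigned case swaps in unsigned comparisons):

    #include <algorithm>
    #include <cstdint>

    // Scalar model of the per-dimension clamp the IR above computes.
    int64_t ClampStart(int64_t start, int64_t output_dim, int64_t update_dim) {
      const int64_t max_bound = output_dim - update_dim;
      return std::min(std::max(start, int64_t{0}), max_bound);
    }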
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
index 7f73fb6b29..3502577d23 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
@@ -66,7 +66,7 @@ inline bool CanEmitFusedDynamicUpdateSliceInPlace(
Status EmitDynamicUpdateSliceInPlace(
tensorflow::gtl::ArraySlice<IrArray> operand_arrays,
const IrArray& output_array, tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Given a loop-fusion node whose root is a dynamic-update-slice op whose
// array-to-be-updated and output share the same buffer slice, emits
@@ -76,7 +76,7 @@ Status EmitFusedDynamicUpdateSliceInPlace(
HloInstruction* fusion,
tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Same as EmitFusedDynamicUpdateSliceInPlace, except emits a parallel loop with
// the given launch dimensions.
@@ -84,8 +84,7 @@ Status EmitParallelFusedDynamicUpdateSliceInPlace(
HloInstruction* fusion,
tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
- const gpu::LaunchDimensions& launch_dimensions,
- llvm::IRBuilder<>* ir_builder);
+ const gpu::LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b);
} // namespace llvm_ir
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index b12ce97e28..72ede377e1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -52,7 +52,7 @@ Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
// that would be regenerated without caching. But this might increase the
// JIT compilation time.
if (generated_value_bb == nullptr ||
- generated_value_bb == ir_builder_->GetInsertBlock()) {
+ generated_value_bb == b_->GetInsertBlock()) {
VLOG(3) << "The cached generated value is reused.";
return generated_value;
}
@@ -60,8 +60,7 @@ Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
"a different BB ("
<< llvm_ir::AsString(generated_value_bb->getName())
<< ") from the current insertion block ("
- << llvm_ir::AsString(ir_builder_->GetInsertBlock()->getName())
- << ").";
+ << llvm_ir::AsString(b_->GetInsertBlock()->getName()) << ").";
}
TF_ASSIGN_OR_RETURN(
@@ -77,14 +76,14 @@ Status FusedIrEmitter::HandleConstant(HloInstruction* constant) {
llvm::Constant* initializer =
llvm_ir::ConvertLiteralToIrConstant(literal, module_);
llvm::GlobalVariable* global = new llvm::GlobalVariable(
- *ir_builder_->GetInsertBlock()->getModule(), initializer->getType(),
+ *b_->GetInsertBlock()->getModule(), initializer->getType(),
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, initializer,
/*Name=*/"");
llvm::Constant* shape_constant = llvm::ConstantExpr::getBitCast(
global, llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo());
generators_[constant] = [=](const IrArray::Index& index) {
return IrArray(shape_constant, constant->shape())
- .EmitReadArrayElement(index, ir_builder_);
+ .EmitReadArrayElement(index, b_);
};
return Status::OK();
@@ -104,7 +103,7 @@ Status FusedIrEmitter::HandleGetTupleElement(
// Emit code to lookup tuple element pointer, and store it in 'gte_values_'.
llvm::Value* tuple_element_ptr = llvm_ir::EmitGetTupleElement(
get_tuple_element->shape(), get_tuple_element->tuple_index(),
- /*alignment=*/1, it->second, ir_builder_, module_);
+ /*alignment=*/1, it->second, b_, module_);
gte_values_.insert(std::make_pair(get_tuple_element, tuple_element_ptr));
// Emit code to read base tuple element array (if non-tuple shaped).
if (!ShapeUtil::IsTuple(get_tuple_element->shape())) {
@@ -112,7 +111,7 @@ Status FusedIrEmitter::HandleGetTupleElement(
[=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
// TODO(b/34080002) Add aliasing information to tuple element IrArray.
return IrArray(tuple_element_ptr, get_tuple_element->shape())
- .EmitReadArrayElement(index, ir_builder_);
+ .EmitReadArrayElement(index, b_);
};
}
return Status::OK();
@@ -129,16 +128,15 @@ Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) {
// want the AA info to be present before address spaces are inferred
// (which is pretty late in the pipeline), so even if we had
// address-space-based AA in LLVM, it wouldn't help us much here.
- return ir_builder_->CreateLoad(
- ir_builder_->CreateGEP(
- param_tile_buffer,
- {index.GetConstantWithIndexType(0), tiled_parameter_info_->x(),
- tiled_parameter_info_->y()}),
+ return b_->CreateLoad(
+ b_->CreateGEP(param_tile_buffer, {index.GetConstantWithIndexType(0),
+ tiled_parameter_info_->x(),
+ tiled_parameter_info_->y()}),
"tiled_buffer");
}
}
return parameter_arrays_[parameter->parameter_number()]
- .EmitReadArrayElement(index, ir_builder_);
+ .EmitReadArrayElement(index, b_);
};
// Store ir value for fusion operand associated with fusion parameter to be
// accessed by subsequent fused GetTupleElement instructions.
@@ -157,11 +155,11 @@ Status FusedIrEmitter::HandleTuple(HloInstruction* tuple) {
}
generators_[tuple] =
[=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
- llvm::Value* ret = llvm::UndefValue::get(llvm::StructType::get(
- ir_builder_->getContext(), operand_elemental_ir_types));
+ llvm::Value* ret = llvm::UndefValue::get(
+ llvm::StructType::get(b_->getContext(), operand_elemental_ir_types));
for (size_t i = 0; i < ShapeUtil::TupleElementCount(tuple->shape()); ++i) {
TF_ASSIGN_OR_RETURN(llvm::Value * val_i, generators_[operands[i]](index));
- ret = ir_builder_->CreateInsertValue(ret, val_i, i);
+ ret = b_->CreateInsertValue(ret, val_i, i);
}
return ret;
};
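
HandleTuple's generator builds the tuple value as an LLVM struct: start from undef and insert each operand's element value in turn. The core pattern, with b, element_types, and element_values standing in for the emitter's state (assumed names, not the emitter's API):

    llvm::Value* ret = llvm::UndefValue::get(
        llvm::StructType::get(b->getContext(), element_types));
    for (size_t i = 0; i < element_values.size(); ++i) {
      ret = b->CreateInsertValue(ret, element_values[i], i);
    }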
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index a6ceec7b23..30471480c4 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -59,7 +59,7 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
: parameter_arrays_(parameter_arrays),
tiled_parameter_info_(nullptr),
elemental_emitter_(elemental_emitter),
- ir_builder_(elemental_emitter->ir_builder()),
+ b_(elemental_emitter->b()),
module_(elemental_emitter->module()) {}
Status DefaultAction(HloInstruction* hlo) override;
@@ -103,7 +103,7 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
const HloInstruction* fused_root_ = nullptr;
// Borrowed
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
llvm::Module* module_;
// Map from instruction pointers to functions to generate elements of their
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index dcf9838d80..2b6caee6aa 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -31,7 +31,7 @@ namespace llvm_ir {
void IrArray::Index::Delinearize(std::vector<llvm::Value*>* multidim,
llvm::Value* linear, const Shape& shape,
- llvm::IRBuilder<>* ir_builder) const {
+ llvm::IRBuilder<>* b) const {
int64 divisor = 1;
const Layout& layout = shape.layout();
for (int64 i = 0; i < layout.minor_to_major_size(); ++i) {
@@ -48,10 +48,9 @@ void IrArray::Index::Delinearize(std::vector<llvm::Value*>* multidim,
// useful because cuda-memcheck can't help us much in XLA: Most of our
// memory lives in one big allocation, so cuda-memcheck can't detect
// out-of-bounds accesses.
- auto* quot =
- ir_builder->CreateUDiv(linear, GetConstantWithIndexType(divisor));
+ auto* quot = b->CreateUDiv(linear, GetConstantWithIndexType(divisor));
if (i < layout.minor_to_major_size() - 1) {
- (*multidim)[dimension] = ir_builder->CreateURem(
+ (*multidim)[dimension] = b->CreateURem(
quot, GetConstantWithIndexType(size_of_current_dimension));
} else {
(*multidim)[dimension] = quot;
@@ -61,7 +60,7 @@ void IrArray::Index::Delinearize(std::vector<llvm::Value*>* multidim,
}
IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
- llvm::IRBuilder<>* ir_builder)
+ llvm::IRBuilder<>* b)
: multidim_(ShapeUtil::Rank(shape)),
linear_(linear),
layout_(shape.layout()),
@@ -71,7 +70,7 @@ IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
CHECK(LayoutUtil::HasLayout(shape))
<< "Shape " << ShapeUtil::HumanStringWithLayout(shape)
<< " should have a layout.";
- Delinearize(&multidim_, linear, shape, ir_builder);
+ Delinearize(&multidim_, linear, shape, b);
}
IrArray::Index::Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
@@ -94,7 +93,7 @@ IrArray::Index::Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
}
IrArray::Index::Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
- const Shape& shape, llvm::IRBuilder<>* ir_builder)
+ const Shape& shape, llvm::IRBuilder<>* b)
: multidim_(multidim.begin(), multidim.end()),
layout_(shape.layout()),
dims_(shape.dimensions().begin(), shape.dimensions().end()) {
@@ -328,6 +327,7 @@ llvm::Value* IrArray::Index::Linearize(
llvm::IRBuilder<>* builder) const {
// Each dimension is multiplied by the product of the sizes of all
// earlier dimensions and added to the accumulator logical_linear_index.
+ CHECK_EQ(size(), dimensions.size());
llvm::Value* logical_linear_index = GetConstantWithIndexType(0);
int64 multiplier = 1;
for (ssize_t i = size() - 1; i >= 0; --i) {
@@ -343,7 +343,7 @@ llvm::Value* IrArray::Index::Linearize(
}
llvm::Value* IrArray::EmitArrayElementAddress(
- const IrArray::Index& index, llvm::IRBuilder<>* ir_builder,
+ const IrArray::Index& index, llvm::IRBuilder<>* b,
tensorflow::StringPiece name) const {
if (ShapeUtil::IsScalar(*shape_)) {
// Special handling of scalars: a scalar pretends to have the same value for
@@ -354,12 +354,11 @@ llvm::Value* IrArray::EmitArrayElementAddress(
CHECK_EQ(index.size(), ShapeUtil::Rank(*shape_));
if (index.LinearValidOnShape(*shape_)) {
- llvm::Module* module =
- ir_builder->GetInsertBlock()->getParent()->getParent();
- return ir_builder->CreateInBoundsGEP(
- ir_builder->CreateBitCast(
- base_ptr_, PrimitiveTypeToIrType(shape_->element_type(), module)
- ->getPointerTo()),
+ llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+ return b->CreateInBoundsGEP(
+ b->CreateBitCast(base_ptr_,
+ PrimitiveTypeToIrType(shape_->element_type(), module)
+ ->getPointerTo()),
{index.linear()}, llvm_ir::AsStringRef(name));
}
@@ -385,8 +384,8 @@ llvm::Value* IrArray::EmitArrayElementAddress(
int64 dimension = LayoutUtil::Major(shape_->layout(), i);
gep_indices.push_back(actual_index[dimension]);
}
- return ir_builder->CreateInBoundsGEP(base_ptr_, gep_indices,
- llvm_ir::AsStringRef(name));
+ return b->CreateInBoundsGEP(base_ptr_, gep_indices,
+ llvm_ir::AsStringRef(name));
}
void IrArray::AnnotateLoadStoreInstructionWithMetadata(
@@ -402,29 +401,27 @@ void IrArray::AnnotateLoadStoreInstructionWithMetadata(
}
llvm::Value* IrArray::EmitReadArrayElement(const Index& index,
- llvm::IRBuilder<>* ir_builder,
+ llvm::IRBuilder<>* b,
tensorflow::StringPiece name) const {
- llvm::Value* element_address =
- EmitArrayElementAddress(index, ir_builder, name);
- llvm::LoadInst* load = ir_builder->CreateLoad(element_address);
+ llvm::Value* element_address = EmitArrayElementAddress(index, b, name);
+ llvm::LoadInst* load = b->CreateLoad(element_address);
AnnotateLoadStoreInstructionWithMetadata(load);
return load;
}
void IrArray::EmitWriteArrayElement(const Index& index, llvm::Value* value,
- llvm::IRBuilder<>* ir_builder) const {
- llvm::Value* element_address = EmitArrayElementAddress(index, ir_builder);
- llvm::StoreInst* store = ir_builder->CreateStore(value, element_address);
+ llvm::IRBuilder<>* b) const {
+ llvm::Value* element_address = EmitArrayElementAddress(index, b);
+ llvm::StoreInst* store = b->CreateStore(value, element_address);
AnnotateLoadStoreInstructionWithMetadata(store);
}
IrArray IrArray::CastToShape(const Shape& new_shape,
- llvm::IRBuilder<>* ir_builder) const {
- llvm::Module* module = ir_builder->GetInsertBlock()->getParent()->getParent();
+ llvm::IRBuilder<>* b) const {
+ llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
llvm::Type* new_ir_type = llvm_ir::ShapeToIrType(new_shape, module);
IrArray new_irarray(
- ir_builder->CreatePointerCast(base_ptr_, new_ir_type->getPointerTo()),
- new_shape);
+ b->CreatePointerCast(base_ptr_, new_ir_type->getPointerTo()), new_shape);
new_irarray.metadata_ = metadata_;
return new_irarray;
}
@@ -432,9 +429,9 @@ IrArray IrArray::CastToShape(const Shape& new_shape,
/* static */ IrArray::Index IrArray::BumpIndex(const Index& index,
int64 which_dimension,
int64 addend,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
Index new_index = index;
- new_index[which_dimension] = ir_builder->CreateAdd(
+ new_index[which_dimension] = b->CreateAdd(
index[which_dimension],
llvm::ConstantInt::get(index[which_dimension]->getType(), addend), "",
/*HasNUW=*/true,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index 0777c49923..28ca793e3e 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -87,20 +87,19 @@ class IrArray {
}
// Constructs an index from linear index "linear" and computes the
- // multi-dimensional index from "linear" and "shape". "ir_builder" is the IR
+ // multi-dimensional index from "linear" and "shape". "b" is the IR
// builder to emit the index of each dimension in the multi-dimensional
// index.
//
// Precondition: "shape" has a layout.
- Index(llvm::Value* linear, const Shape& shape,
- llvm::IRBuilder<>* ir_builder);
+ Index(llvm::Value* linear, const Shape& shape, llvm::IRBuilder<>* b);
// Constructs an index from the given multi-dimensional index and the shape
// that it indexes into.
//
// Precondition: "shape" has a layout.
Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
- const Shape& shape, llvm::IRBuilder<>* ir_builder);
+ const Shape& shape, llvm::IRBuilder<>* b);
// Constructs an index from both a multi-dimensional index and a linear
// index. "shape" has the same meaning as that in the constructor that takes
@@ -191,7 +190,7 @@ class IrArray {
}
void Delinearize(std::vector<llvm::Value*>* multidim, llvm::Value* linear,
- const Shape& shape, llvm::IRBuilder<>* ir_builder) const;
+ const Shape& shape, llvm::IRBuilder<>* b) const;
std::vector<llvm::Value*> multidim_;
@@ -240,8 +239,7 @@ class IrArray {
//
// The optional name is useful for debugging when looking at
// the emitted LLVM IR.
- llvm::Value* EmitArrayElementAddress(const Index& index,
- llvm::IRBuilder<>* ir_builder,
+ llvm::Value* EmitArrayElementAddress(const Index& index, llvm::IRBuilder<>* b,
tensorflow::StringPiece name = "") const;
// Attach metadata this IrArray instance knows about to "instruction".
@@ -255,18 +253,16 @@ class IrArray {
//
// The optional name is useful for debugging when looking at
// the emitted LLVM IR.
- llvm::Value* EmitReadArrayElement(const Index& index,
- llvm::IRBuilder<>* ir_builder,
+ llvm::Value* EmitReadArrayElement(const Index& index, llvm::IRBuilder<>* b,
tensorflow::StringPiece name = "") const;
// Emit IR to write the given value to the array element at the given index.
void EmitWriteArrayElement(const Index& index, llvm::Value* value,
- llvm::IRBuilder<>* ir_builder) const;
+ llvm::IRBuilder<>* b) const;
// Returns a new IrArray whose shape is "new_shape" and base pointer is a
// bitcast of the base pointer of "this" IrArray.
- IrArray CastToShape(const Shape& new_shape,
- llvm::IRBuilder<>* ir_builder) const;
+ IrArray CastToShape(const Shape& new_shape, llvm::IRBuilder<>* b) const;
void AddAliasScopeMetadata(llvm::MDNode* alias_scope) {
CHECK_NE(alias_scope, nullptr);
@@ -312,7 +308,7 @@ class IrArray {
// Bumps the "which_dimension" value within the provided index by the provided
// addend.
static Index BumpIndex(const Index& index, int64 which_dimension,
- int64 addend, llvm::IRBuilder<>* ir_builder);
+ int64 addend, llvm::IRBuilder<>* b);
private:
// Add the specified LLVM IR metadata to loads/stores associated with this
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index 98d0ceb3e2..b79567369a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -22,9 +22,9 @@ Status KernelSupportLibrary::For(
tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
llvm::Value* step,
const std::function<Status(llvm::Value*, bool)>& for_body_generator) {
- return If(ir_builder_->CreateICmpSLT(start, end), [&]() -> Status {
+ return If(b_->CreateICmpSLT(start, end), [&]() -> Status {
TF_RETURN_IF_ERROR(for_body_generator(start, /*is_first_iteration=*/true));
- return For(name, ir_builder_->CreateAdd(start, step), end, step,
+ return For(name, b_->CreateAdd(start, step), end, step,
[&](llvm::Value* iv) { return for_body_generator(iv, false); });
});
}
@@ -37,20 +37,20 @@ Status KernelSupportLibrary::For(
if (peel_first_iteration) {
return For(name, start, end, step, true,
[&](llvm::Value* indvar, bool is_first_iteration) -> Status {
- return for_body_generator(
- indvar, ir_builder_->getInt1(is_first_iteration));
+ return for_body_generator(indvar,
+ b_->getInt1(is_first_iteration));
});
} else {
std::unique_ptr<llvm_ir::ForLoop> loop = llvm_ir::ForLoop::EmitForLoop(
- name, start, end, step, ir_builder_,
+ name, start, end, step, b_,
/*unroll_mode=*/unroll_mode_,
/*prevent_vectorization=*/prevent_vectorization_);
- ir_builder_->SetInsertPoint(&loop->GetBodyBasicBlock()->back());
+ b_->SetInsertPoint(&loop->GetBodyBasicBlock()->back());
TF_RETURN_IF_ERROR(
for_body_generator(loop->GetIndVarValue(),
- /*is_first_iteration=*/ir_builder_->CreateICmpEQ(
+ /*is_first_iteration=*/b_->CreateICmpEQ(
loop->GetIndVarValue(), start)));
- llvm_ir::SetToLastInsertPoint(loop->GetExitBasicBlock(), ir_builder_);
+ llvm_ir::SetToLastInsertPoint(loop->GetExitBasicBlock(), b_);
return Status::OK();
}
}
@@ -59,23 +59,22 @@ Status KernelSupportLibrary::If(
tensorflow::StringPiece name, llvm::Value* condition,
const std::function<Status()>& true_block_generator,
const std::function<Status()>& false_block_generator) {
- llvm_ir::LlvmIfData if_data =
- llvm_ir::EmitIfThenElse(condition, name, ir_builder_);
- ir_builder_->SetInsertPoint(&if_data.true_block->back());
+ llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(condition, name, b_);
+ b_->SetInsertPoint(&if_data.true_block->back());
TF_RETURN_IF_ERROR(true_block_generator());
- ir_builder_->SetInsertPoint(&if_data.false_block->back());
+ b_->SetInsertPoint(&if_data.false_block->back());
TF_RETURN_IF_ERROR(false_block_generator());
- llvm_ir::SetToLastInsertPoint(if_data.after_block, ir_builder_);
+ llvm_ir::SetToLastInsertPoint(if_data.after_block, b_);
return Status::OK();
}
void KernelSupportLibrary::EmitAndCallOutlinedKernel(
- bool enable_fast_math, bool optimize_for_size,
- llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+ bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+ tensorflow::StringPiece kernel_name,
KernelSupportLibrary::ArgumentVector arguments,
const std::function<void(KernelSupportLibrary::ArgumentVector)>&
kernel_body_generator) {
- llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+ llvm::Module* module = b->GetInsertBlock()->getModule();
llvm::Function* function =
module->getFunction(llvm_ir::AsStringRef(kernel_name));
@@ -98,22 +97,22 @@ void KernelSupportLibrary::EmitAndCallOutlinedKernel(
std::back_inserter(arg_types),
[](llvm::Value* arg) { return arg->getType(); });
- auto* function_type = llvm::FunctionType::get(
- ir_builder->getVoidTy(), arg_types, /*isVarArg=*/false);
+ auto* function_type =
+ llvm::FunctionType::get(b->getVoidTy(), arg_types, /*isVarArg=*/false);
function = llvm_ir::CreateFunction(
function_type, llvm::GlobalValue::InternalLinkage,
/*enable_fast_math=*/enable_fast_math,
/*optimize_for_size=*/optimize_for_size, kernel_name, module);
- llvm::IRBuilder<>::InsertPointGuard guard(*ir_builder);
+ llvm::IRBuilder<>::InsertPointGuard guard(*b);
auto* entry_bb =
- llvm::BasicBlock::Create(ir_builder->getContext(), "entry", function);
- auto* return_inst = llvm::ReturnInst::Create(ir_builder->getContext(),
+ llvm::BasicBlock::Create(b->getContext(), "entry", function);
+ auto* return_inst = llvm::ReturnInst::Create(b->getContext(),
/*retVal=*/nullptr, entry_bb);
// Set the insert point to before return_inst.
- ir_builder->SetInsertPoint(return_inst);
+ b->SetInsertPoint(return_inst);
std::vector<llvm::Value*> arg_values;
/*
@@ -133,7 +132,7 @@ void KernelSupportLibrary::EmitAndCallOutlinedKernel(
VLOG(3) << "Re-using kernel for " << kernel_name;
}
- ir_builder->CreateCall(function, llvm_ir::AsArrayRef(sanitized_args));
+ b->CreateCall(function, llvm_ir::AsArrayRef(sanitized_args));
}
} // namespace xla
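
When peel_first_iteration is set, the first trip through the body is emitted outside the loop, so the body generator sees a constant true for is_first_iteration instead of a runtime flag. The scalar equivalent of what For emits:

    if (start < end) {
      body(start, /*is_first_iteration=*/true);
      for (int64_t i = start + step; i < end; i += step) {
        body(i, /*is_first_iteration=*/false);
      }
    }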
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 9d770cc4c3..b00f903d56 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -30,14 +30,14 @@ namespace xla {
// flow more readable.
class KernelSupportLibrary {
public:
- // `ir_builder` is the llvm::IRBuilder instance used to generate LLVM IR.
+ // `b` is the llvm::IRBuilder instance used to generate LLVM IR.
// `unroll_mode` specifies the desired LLVM unrolling behavior for every loop
// generated by this instance of KernelSupportLibrary.
explicit KernelSupportLibrary(
- llvm::IRBuilder<>* ir_builder,
+ llvm::IRBuilder<>* b,
llvm_ir::UnrollMode unroll_mode = llvm_ir::UnrollMode::kNoUnroll,
bool prevent_vectorization = true)
- : ir_builder_(ir_builder),
+ : b_(b),
unroll_mode_(unroll_mode),
prevent_vectorization_(prevent_vectorization) {}
@@ -71,18 +71,18 @@ class KernelSupportLibrary {
const std::function<Status(llvm::Value* ind_var,
bool is_first_iteration)>&
for_body_generator) {
- return For(name, /*start=*/ir_builder_->getInt64(start),
- /*end=*/ir_builder_->getInt64(end),
- /*step=*/ir_builder_->getInt64(step), for_body_generator);
+ return For(name, /*start=*/b_->getInt64(start),
+ /*end=*/b_->getInt64(end),
+ /*step=*/b_->getInt64(step), for_body_generator);
}
void ForReturnVoid(
tensorflow::StringPiece name, int64 start, int64 end, int64 step,
const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
for_body_generator) {
- ForReturnVoid(name, /*start=*/ir_builder_->getInt64(start),
- /*end=*/ir_builder_->getInt64(end),
- /*step=*/ir_builder_->getInt64(step), for_body_generator);
+ ForReturnVoid(name, /*start=*/b_->getInt64(start),
+ /*end=*/b_->getInt64(end),
+ /*step=*/b_->getInt64(step), for_body_generator);
}
// Generates the following control flow structure if `peel_first_iteration` is
@@ -184,17 +184,17 @@ class KernelSupportLibrary {
Status For(
tensorflow::StringPiece name, int64 start, int64 end, int64 step,
const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
- return For(name, /*start=*/ir_builder_->getInt64(start),
- /*end=*/ir_builder_->getInt64(end),
- /*step=*/ir_builder_->getInt64(step), for_body_generator);
+ return For(name, /*start=*/b_->getInt64(start),
+ /*end=*/b_->getInt64(end),
+ /*step=*/b_->getInt64(step), for_body_generator);
}
void ForReturnVoid(
tensorflow::StringPiece name, int64 start, int64 end, int64 step,
const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
- ForReturnVoid(name, /*start=*/ir_builder_->getInt64(start),
- /*end=*/ir_builder_->getInt64(end),
- /*step=*/ir_builder_->getInt64(step), for_body_generator);
+ ForReturnVoid(name, /*start=*/b_->getInt64(start),
+ /*end=*/b_->getInt64(end),
+ /*step=*/b_->getInt64(step), for_body_generator);
}
// Generates the following control flow structure:
@@ -258,41 +258,39 @@ class KernelSupportLibrary {
// in a nullptr llvm::Value* in its position to `kernel_body_generator`.
// Currently we only support at most one nullptr value in `arguments`.
static void EmitAndCallOutlinedKernel(
- bool enable_fast_math, bool optimize_for_size,
- llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
- ArgumentVector arguments,
+ bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+ tensorflow::StringPiece kernel_name, ArgumentVector arguments,
const std::function<void(ArgumentVector)>& kernel_body_generator);
// Thin wrappers around the more general EmitAndCallOutlinedKernel above.
static void EmitAndCallOutlinedKernel(
- bool enable_fast_math, bool optimize_for_size,
- llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
- llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2,
+ bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+ tensorflow::StringPiece kernel_name, llvm::Value* arg0, llvm::Value* arg1,
+ llvm::Value* arg2,
const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*)>&
kernel_body_generator) {
EmitAndCallOutlinedKernel(
- enable_fast_math, optimize_for_size, ir_builder, kernel_name,
- {arg0, arg1, arg2}, [&](ArgumentVector args) {
+ enable_fast_math, optimize_for_size, b, kernel_name, {arg0, arg1, arg2},
+ [&](ArgumentVector args) {
kernel_body_generator(args[0], args[1], args[2]);
});
}
static void EmitAndCallOutlinedKernel(
- bool enable_fast_math, bool optimize_for_size,
- llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
- llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2,
- llvm::Value* arg3,
+ bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+ tensorflow::StringPiece kernel_name, llvm::Value* arg0, llvm::Value* arg1,
+ llvm::Value* arg2, llvm::Value* arg3,
const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*,
llvm::Value*)>& kernel_body_generator) {
EmitAndCallOutlinedKernel(
- enable_fast_math, optimize_for_size, ir_builder, kernel_name,
+ enable_fast_math, optimize_for_size, b, kernel_name,
{arg0, arg1, arg2, arg3}, [&](ArgumentVector args) {
kernel_body_generator(args[0], args[1], args[2], args[3]);
});
}
private:
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
llvm_ir::UnrollMode unroll_mode_;
bool prevent_vectorization_;
};
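
A hedged usage sketch of the renamed interface; context, num_rows, and the surrounding Status plumbing are assumed to exist in the caller:

    llvm::IRBuilder<> b(context);
    KernelSupportLibrary ksl(&b, llvm_ir::UnrollMode::kDefaultUnroll);
    TF_RETURN_IF_ERROR(ksl.For(
        "row", /*start=*/0, /*end=*/num_rows, /*step=*/1,
        [&](llvm::Value* row) -> Status {
          // emit per-row IR through `b` here
          return Status::OK();
        }));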
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index 533b75cdae..35b3941272 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -94,24 +94,24 @@ tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(
IrArray::Index GetUnreducedOutputIndex(
const IrArray::Index& reduced_output_index,
const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
auto bounds = reduced_output_shape.dimensions();
auto minor_to_major = reduced_output_shape.layout().minor_to_major();
llvm::Value* linear_index = reduced_output_index.GetConstantWithIndexType(0);
int64 multiplier = 1;
for (int i = 0; i < reduced_output_index.size(); ++i) {
int64 dim = minor_to_major[i];
- llvm::Value* addend = ir_builder->CreateMul(
- reduced_output_index[dim],
- reduced_output_index.GetConstantWithIndexType(multiplier),
- "linearizing",
- /*HasNUW=*/true, /*HasNSW=*/true);
- linear_index = ir_builder->CreateAdd(linear_index, addend, "",
- /*HasNUW=*/true, /*HasNSW=*/true);
+ llvm::Value* addend =
+ b->CreateMul(reduced_output_index[dim],
+ reduced_output_index.GetConstantWithIndexType(multiplier),
+ "linearizing",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ linear_index = b->CreateAdd(linear_index, addend, "",
+ /*HasNUW=*/true, /*HasNSW=*/true);
multiplier *= bounds[dim];
}
- return IrArray::Index(linear_index, unreduced_output_shape, ir_builder);
+ return IrArray::Index(linear_index, unreduced_output_shape, b);
}
} // namespace llvm_ir
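
The loop walks dimensions minor-to-major, so each dimension contributes its index times the product of all more-minor extents; the resulting linear offset is then delinearized against the unreduced shape. In scalar terms:

    int64_t linear = 0, multiplier = 1;
    for (int i = 0; i < rank; ++i) {
      const int64_t dim = minor_to_major[i];   // minor-to-major walk
      linear += reduced_index[dim] * multiplier;
      multiplier *= bounds[dim];
    }
    // `linear` is then re-expanded against unreduced_output_shape's layout.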
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index 6f1268fffb..ccb9b8ba3e 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -44,7 +44,7 @@ tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
IrArray::Index GetUnreducedOutputIndex(
const IrArray::Index& reduced_output_index,
const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// A class to represent information for tiled parameters to support IR emission
// for 021 transpose.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index 1227534779..ba7f94834c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -47,27 +47,27 @@ ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
/* static */ std::unique_ptr<ForLoop> ForLoop::EmitForLoop(
tensorflow::StringPiece prefix, llvm::Value* start_index,
- llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
+ llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* b,
UnrollMode unroll_mode, bool prevent_vectorization) {
std::unique_ptr<ForLoop> loop(new ForLoop(prefix, /*suffix=*/"", start_index,
end_index, step, unroll_mode,
prevent_vectorization));
- loop->Emit(ir_builder);
+ loop->Emit(b);
return loop;
}
-void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
+void ForLoop::Emit(llvm::IRBuilder<>* b) {
// The preheader block is the block the builder is currently emitting
// code into.
- preheader_bb_ = ir_builder->GetInsertBlock();
+ preheader_bb_ = b->GetInsertBlock();
- llvm::BasicBlock::iterator insert_point = ir_builder->GetInsertPoint();
+ llvm::BasicBlock::iterator insert_point = b->GetInsertPoint();
if (insert_point == preheader_bb_->end()) {
// We're emitting the loop at the end of a basic block. Verify there is no
// terminator (eg, branch) in the basic block.
CHECK_EQ(nullptr, preheader_bb_->getTerminator());
- exit_bb_ = CreateLoopBB("loop_exit", ir_builder);
+ exit_bb_ = CreateLoopBB("loop_exit", b);
} else {
// We're emitting the loop into the middle of a basic block. splitBasicBlock
// requires that this basic block be well-formed (have a terminator).
@@ -86,51 +86,50 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
insert_before_bb_ = exit_bb_;
// Create the remaining basic blocks which form the inside of the loop.
- header_bb_ = CreateLoopBB("loop_header", ir_builder);
- body_bb_ = CreateLoopBB("loop_body", ir_builder);
+ header_bb_ = CreateLoopBB("loop_header", b);
+ body_bb_ = CreateLoopBB("loop_body", b);
// Function entry basic block.
// Emit alloca for the induction variable. We do this in the function's entry
// basic block to ensure the alloca only executes once per function (we could
// be emitting a nested loop).
llvm::Function* func = preheader_bb_->getParent();
- ir_builder->SetInsertPoint(&func->getEntryBlock(),
- func->getEntryBlock().getFirstInsertionPt());
+ b->SetInsertPoint(&func->getEntryBlock(),
+ func->getEntryBlock().getFirstInsertionPt());
llvm::Value* indvar_address =
- ir_builder->CreateAlloca(start_index_->getType(), nullptr,
- AsStringRef(GetQualifiedName("invar_address")));
+ b->CreateAlloca(start_index_->getType(), nullptr,
+ AsStringRef(GetQualifiedName("invar_address")));
// Preheader basic block.
// Initialize induction variable starting index. Create branch to the header.
- ir_builder->SetInsertPoint(preheader_bb_);
- ir_builder->CreateStore(start_index_, indvar_address);
+ b->SetInsertPoint(preheader_bb_);
+ b->CreateStore(start_index_, indvar_address);
// The preheader should not have a branch yet.
CHECK_EQ(preheader_bb_->getTerminator(), nullptr);
- ir_builder->CreateBr(header_bb_);
+ b->CreateBr(header_bb_);
// Header basic block.
// Emit the loop conditional branch. Load and compare indvar with ending
// index and jump to loop exit if equal. Jump to body otherwise.
- ir_builder->SetInsertPoint(header_bb_);
- indvar_ = ir_builder->CreateLoad(indvar_address,
- AsStringRef(GetQualifiedName("indvar")));
- llvm::Value* exit_cond = ir_builder->CreateICmpUGE(indvar_, end_index_);
- ir_builder->CreateCondBr(/*Cond=*/exit_cond,
- /*True=*/exit_bb_, /*False=*/body_bb_);
+ b->SetInsertPoint(header_bb_);
+ indvar_ =
+ b->CreateLoad(indvar_address, AsStringRef(GetQualifiedName("indvar")));
+ llvm::Value* exit_cond = b->CreateICmpUGE(indvar_, end_index_);
+ b->CreateCondBr(/*Cond=*/exit_cond,
+ /*True=*/exit_bb_, /*False=*/body_bb_);
// Body basic block.
// Increment indvar, store indvar, and jump to header.
- ir_builder->SetInsertPoint(body_bb_);
+ b->SetInsertPoint(body_bb_);
llvm::Value* step = step_;
llvm::Value* indvar = indvar_;
- llvm::Value* indvar_inc =
- ir_builder->CreateAdd(indvar, step, "invar.inc",
- /*HasNUW=*/true, /*HasNSW=*/true);
- ir_builder->CreateStore(indvar_inc, indvar_address);
- llvm::BranchInst* back_branch = ir_builder->CreateBr(header_bb_);
+ llvm::Value* indvar_inc = b->CreateAdd(indvar, step, "invar.inc",
+ /*HasNUW=*/true, /*HasNSW=*/true);
+ b->CreateStore(indvar_inc, indvar_address);
+ llvm::BranchInst* back_branch = b->CreateBr(header_bb_);
- std::vector<llvm::Metadata*> loop_metadata = GetLoopMetadata(ir_builder);
+ std::vector<llvm::Metadata*> loop_metadata = GetLoopMetadata(b);
if (!loop_metadata.empty()) {
llvm::LLVMContext* ctx = &start_index_->getContext();
auto temp_node = llvm::MDNode::getTemporary(*ctx, llvm::None);
@@ -141,11 +140,10 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
}
// Re-point the IR builder to the loop exit block.
- ir_builder->SetInsertPoint(exit_bb_);
+ b->SetInsertPoint(exit_bb_);
}
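
Emit wires the blocks into the classic loop skeleton; note the exit test is an unsigned comparison (ICmpUGE). As scalar control flow, roughly:

    // preheader: indvar = start;            branch to header
    // header:    if (indvar >= end)         branch to exit, else to body
    // body:      ...body...; indvar += step; branch back to header
    // exit:      the builder is left inserting here
    for (uint64_t indvar = start; indvar < end; indvar += step) {
      // loop body
    }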
-std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(
- llvm::IRBuilder<>* ir_builder) {
+std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(llvm::IRBuilder<>* b) {
const char* const kLlvmLoopUnrollDisableMDName = "llvm.loop.unroll.disable";
const char* const kLlvmLoopUnrollFullMDName = "llvm.loop.unroll.full";
const char* const kLlvmLoopVectorizeMDName = "llvm.loop.vectorize.enable";
@@ -160,7 +158,7 @@ std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(
if (prevent_vectorization_) {
result.push_back(llvm::MDNode::get(
*ctx, {llvm::MDString::get(*ctx, kLlvmLoopVectorizeMDName),
- llvm::ConstantAsMetadata::get(ir_builder->getFalse())}));
+ llvm::ConstantAsMetadata::get(b->getFalse())}));
}
if (unroll_mode_ == xla::llvm_ir::UnrollMode::kFullyUnroll) {
@@ -175,9 +173,8 @@ string ForLoop::GetQualifiedName(tensorflow::StringPiece name) {
}
llvm::BasicBlock* ForLoop::CreateLoopBB(tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder) {
- return CreateBasicBlock(insert_before_bb_, GetQualifiedName(name),
- ir_builder);
+ llvm::IRBuilder<>* b) {
+ return CreateBasicBlock(insert_before_bb_, GetQualifiedName(name), b);
}
std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
@@ -197,12 +194,12 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
bool prevent_vectorization) {
if (inner_loop_body_bb_ != nullptr) {
// Create this loop inside the previous one.
- ir_builder_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt());
+ b_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt());
}
std::unique_ptr<ForLoop> loop(new ForLoop(
/*prefix=*/name_, suffix, start_index, end_index, stride, unroll_mode,
prevent_vectorization));
- loop->Emit(ir_builder_);
+ loop->Emit(b_);
if (outer_loop_preheader_bb_ == nullptr) {
outer_loop_preheader_bb_ = loop->GetPreheaderBasicBlock();
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index b3266022db..a4fed5c8dc 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -79,7 +79,7 @@ class ForLoop {
// loop.
static std::unique_ptr<ForLoop> EmitForLoop(
tensorflow::StringPiece prefix, llvm::Value* start_index,
- llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
+ llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* b,
UnrollMode unroll_mode = llvm_ir::UnrollMode::kDefaultUnroll,
bool prevent_vectorization = false);
@@ -138,10 +138,10 @@ class ForLoop {
UnrollMode unroll_mode, bool prevent_vectorization);
// Emit the loop at the insert point of the builder.
- void Emit(llvm::IRBuilder<>* ir_builder);
+ void Emit(llvm::IRBuilder<>* b);
llvm::BasicBlock* CreateLoopBB(tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Creates a name for an LLVM construct, appending prefix_ and suffix_, if
// they are set.
@@ -149,7 +149,7 @@ class ForLoop {
// Return a list of metadata nodes that should be associated with the
// llvm::Loop for this `ForLoop`.
- std::vector<llvm::Metadata*> GetLoopMetadata(llvm::IRBuilder<>* ir_builder);
+ std::vector<llvm::Metadata*> GetLoopMetadata(llvm::IRBuilder<>* b);
string prefix_;
string suffix_;
@@ -177,19 +177,18 @@ class ForLoop {
// A simple class for constructing nested for-loops.
class ForLoopNest {
public:
- explicit ForLoopNest(llvm::IRBuilder<>* ir_builder,
- llvm::Type* index_ty = nullptr)
- : ForLoopNest(/*name=*/"", ir_builder) {
+ explicit ForLoopNest(llvm::IRBuilder<>* b, llvm::Type* index_ty = nullptr)
+ : ForLoopNest(/*name=*/"", b) {
SetIndexType(index_ty);
}
- ForLoopNest(tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder,
+ ForLoopNest(tensorflow::StringPiece name, llvm::IRBuilder<>* b,
llvm::Type* index_ty = nullptr)
: name_(std::string(name)),
outer_loop_preheader_bb_(nullptr),
outer_loop_exit_bb_(nullptr),
inner_loop_body_bb_(nullptr),
- ir_builder_(ir_builder) {
+ b_(b) {
SetIndexType(index_ty);
}
@@ -270,7 +269,7 @@ class ForLoopNest {
private:
void SetIndexType(llvm::Type* index_ty) {
- index_type_ = index_ty == nullptr ? ir_builder_->getInt64Ty() : index_ty;
+ index_type_ = index_ty == nullptr ? b_->getInt64Ty() : index_ty;
}
llvm::Constant* GetConstantWithIndexType(int64 c) const {
@@ -289,7 +288,7 @@ class ForLoopNest {
// has been added yet.
llvm::BasicBlock* inner_loop_body_bb_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
llvm::Type* index_type_;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 6c55361b44..e6126881af 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -48,8 +48,8 @@ namespace {
// Note, this function is only useful in an insertion context; in a global
// (e.g. constants) context it will CHECK fail.
-llvm::Module* ModuleFromIRBuilder(llvm::IRBuilder<>* ir_builder) {
- auto block = CHECK_NOTNULL(ir_builder->GetInsertBlock());
+llvm::Module* ModuleFromIRBuilder(llvm::IRBuilder<>* b) {
+ auto block = CHECK_NOTNULL(b->GetInsertBlock());
auto fn = CHECK_NOTNULL(block->getParent());
auto module = CHECK_NOTNULL(fn->getParent());
return module;
@@ -87,41 +87,41 @@ llvm::Value* EmitCallToIntrinsic(
llvm::Intrinsic::ID intrinsic_id,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
tensorflow::gtl::ArraySlice<llvm::Type*> overloaded_types,
- llvm::IRBuilder<>* ir_builder) {
- llvm::Module* module = ModuleFromIRBuilder(ir_builder);
+ llvm::IRBuilder<>* b) {
+ llvm::Module* module = ModuleFromIRBuilder(b);
llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(
module, intrinsic_id, AsArrayRef(overloaded_types));
- return ir_builder->CreateCall(intrinsic, AsArrayRef(operands));
+ return b->CreateCall(intrinsic, AsArrayRef(operands));
}
llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value,
- llvm::IRBuilder<>* ir_builder) {
- if (ir_builder->getFastMathFlags().noNaNs()) {
- auto cmp = ir_builder->CreateFCmpUGE(lhs_value, rhs_value);
- return ir_builder->CreateSelect(cmp, lhs_value, rhs_value);
+ llvm::IRBuilder<>* b) {
+ if (b->getFastMathFlags().noNaNs()) {
+ auto cmp = b->CreateFCmpUGE(lhs_value, rhs_value);
+ return b->CreateSelect(cmp, lhs_value, rhs_value);
} else {
- auto cmp_ge = ir_builder->CreateFCmpOGE(lhs_value, rhs_value);
- auto lhs_is_nan = ir_builder->CreateFCmpUNE(lhs_value, lhs_value);
- auto sel_lhs = ir_builder->CreateOr(cmp_ge, lhs_is_nan);
- return ir_builder->CreateSelect(sel_lhs, lhs_value, rhs_value);
+ auto cmp_ge = b->CreateFCmpOGE(lhs_value, rhs_value);
+ auto lhs_is_nan = b->CreateFCmpUNE(lhs_value, lhs_value);
+ auto sel_lhs = b->CreateOr(cmp_ge, lhs_is_nan);
+ return b->CreateSelect(sel_lhs, lhs_value, rhs_value);
}
}
llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value,
- llvm::IRBuilder<>* ir_builder) {
- if (ir_builder->getFastMathFlags().noNaNs()) {
- auto cmp = ir_builder->CreateFCmpULE(lhs_value, rhs_value);
- return ir_builder->CreateSelect(cmp, lhs_value, rhs_value);
+ llvm::IRBuilder<>* b) {
+ if (b->getFastMathFlags().noNaNs()) {
+ auto cmp = b->CreateFCmpULE(lhs_value, rhs_value);
+ return b->CreateSelect(cmp, lhs_value, rhs_value);
} else {
- auto cmp_le = ir_builder->CreateFCmpOLE(lhs_value, rhs_value);
- auto lhs_is_nan = ir_builder->CreateFCmpUNE(lhs_value, lhs_value);
- auto sel_lhs = ir_builder->CreateOr(cmp_le, lhs_is_nan);
- return ir_builder->CreateSelect(sel_lhs, lhs_value, rhs_value);
+ auto cmp_le = b->CreateFCmpOLE(lhs_value, rhs_value);
+ auto lhs_is_nan = b->CreateFCmpUNE(lhs_value, lhs_value);
+ auto sel_lhs = b->CreateOr(cmp_le, lhs_is_nan);
+ return b->CreateSelect(sel_lhs, lhs_value, rhs_value);
}
}
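For reference, the non-fast-math path above reads as the following host-side scalar sketch (illustrative C++, not part of the change); a NaN on the lhs wins because the ordered compare is OR-ed with an unordered self-compare.

// Scalar model of the non-fast-math EmitFloatMax path (a sketch).
inline float FloatMaxModel(float lhs, float rhs) {
  bool cmp_ge = lhs >= rhs;      // ordered compare: false if either is NaN
  bool lhs_is_nan = lhs != lhs;  // unordered self-compare detects NaN
  return (cmp_ge || lhs_is_nan) ? lhs : rhs;  // a NaN lhs propagates
}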
llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, llvm::Value* index,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
llvm::Type* array_type = array->getType();
CHECK(array_type->isPointerTy());
llvm::PointerType* array_type_as_pointer =
@@ -131,16 +131,16 @@ llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, llvm::Value* index,
<< " array=" << llvm_ir::DumpToString(*array)
<< " index=" << llvm_ir::DumpToString(*index);
- return ir_builder->CreateInBoundsGEP(
+ return b->CreateInBoundsGEP(
array_type_as_pointer->getElementType(), array,
llvm::isa<llvm::GlobalVariable>(array)
- ? llvm::ArrayRef<llvm::Value*>({ir_builder->getInt64(0), index})
+ ? llvm::ArrayRef<llvm::Value*>({b->getInt64(0), index})
: index);
}
llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, int64 index,
- llvm::IRBuilder<>* ir_builder) {
- return EmitBufferIndexingGEP(array, ir_builder->getInt64(index), ir_builder);
+ llvm::IRBuilder<>* b) {
+ return EmitBufferIndexingGEP(array, b->getInt64(index), b);
}
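A minimal usage sketch for the helpers above, assuming an 'array' pointer and builder 'b' are in scope (names illustrative):

// Emit &array[5] and load the element (a sketch).
llvm::Value* elem_ptr = llvm_ir::EmitBufferIndexingGEP(array, /*index=*/5, b);
llvm::Value* elem = b->CreateLoad(elem_ptr, "array_elem");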
llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
@@ -232,14 +232,15 @@ llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module) {
return result_type;
}
-StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(
- const Shape& shape, int32* shape_size, llvm::IRBuilder<>* ir_builder) {
+StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(const Shape& shape,
+ int32* shape_size,
+ llvm::IRBuilder<>* b) {
string encoded_shape = shape.SerializeAsString();
if (encoded_shape.size() > std::numeric_limits<int32>::max()) {
return InternalError("Encoded shape size exceeded int32 size limit.");
}
*shape_size = static_cast<int32>(encoded_shape.size());
- return ir_builder->CreateGlobalStringPtr(llvm_ir::AsStringRef(encoded_shape));
+ return b->CreateGlobalStringPtr(llvm_ir::AsStringRef(encoded_shape));
}
StatusOr<Shape> DecodeSelfDescribingShapeConstant(const void* shape_ptr,
@@ -262,59 +263,57 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type,
tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder,
+ llvm::IRBuilder<>* b,
int alignment) {
- return EmitAllocaAtFunctionEntryWithCount(type, nullptr, name, ir_builder,
- alignment);
+ return EmitAllocaAtFunctionEntryWithCount(type, nullptr, name, b, alignment);
}
llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(
llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder, int alignment) {
- llvm::IRBuilder<>::InsertPoint insert_point = ir_builder->saveIP();
- llvm::Function* function = ir_builder->GetInsertBlock()->getParent();
- ir_builder->SetInsertPoint(&function->getEntryBlock(),
- function->getEntryBlock().getFirstInsertionPt());
+ llvm::IRBuilder<>* b, int alignment) {
+ llvm::IRBuilder<>::InsertPoint insert_point = b->saveIP();
+ llvm::Function* function = b->GetInsertBlock()->getParent();
+ b->SetInsertPoint(&function->getEntryBlock(),
+ function->getEntryBlock().getFirstInsertionPt());
llvm::AllocaInst* alloca =
- ir_builder->CreateAlloca(type, element_count, AsStringRef(name));
+ b->CreateAlloca(type, element_count, AsStringRef(name));
if (alignment != 0) {
alloca->setAlignment(alignment);
}
- ir_builder->restoreIP(insert_point);
+ b->restoreIP(insert_point);
return alloca;
}
llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before,
tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
return llvm::BasicBlock::Create(
- /*Context=*/ir_builder->getContext(),
+ /*Context=*/b->getContext(),
/*Name=*/AsStringRef(name),
- /*Parent=*/ir_builder->GetInsertBlock()->getParent(),
+ /*Parent=*/b->GetInsertBlock()->getParent(),
/*InsertBefore*/ insert_before);
}
LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder, bool emit_else) {
+ llvm::IRBuilder<>* b, bool emit_else) {
llvm_ir::LlvmIfData if_data;
- if_data.if_block = ir_builder->GetInsertBlock();
- if_data.true_block = CreateBasicBlock(
- nullptr, tensorflow::strings::StrCat(name, "-true"), ir_builder);
+ if_data.if_block = b->GetInsertBlock();
+ if_data.true_block =
+ CreateBasicBlock(nullptr, tensorflow::strings::StrCat(name, "-true"), b);
if_data.false_block =
- emit_else ? CreateBasicBlock(nullptr,
- tensorflow::strings::StrCat(name, "-false"),
- ir_builder)
+ emit_else ? CreateBasicBlock(
+ nullptr, tensorflow::strings::StrCat(name, "-false"), b)
: nullptr;
// Add a terminator to the if block, if necessary.
if (if_data.if_block->getTerminator() == nullptr) {
- ir_builder->SetInsertPoint(if_data.if_block);
+ b->SetInsertPoint(if_data.if_block);
if_data.after_block = CreateBasicBlock(
- nullptr, tensorflow::strings::StrCat(name, "-after"), ir_builder);
- ir_builder->CreateBr(if_data.after_block);
+ nullptr, tensorflow::strings::StrCat(name, "-after"), b);
+ b->CreateBr(if_data.after_block);
} else {
if_data.after_block = if_data.if_block->splitBasicBlock(
- ir_builder->GetInsertPoint(),
+ b->GetInsertPoint(),
AsStringRef(tensorflow::strings::StrCat(name, "-after")));
}
@@ -322,39 +321,37 @@ LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name,
// we're going to replace it with a conditional branch.
if_data.if_block->getTerminator()->eraseFromParent();
- ir_builder->SetInsertPoint(if_data.if_block);
- ir_builder->CreateCondBr(
- condition, if_data.true_block,
- emit_else ? if_data.false_block : if_data.after_block);
+ b->SetInsertPoint(if_data.if_block);
+ b->CreateCondBr(condition, if_data.true_block,
+ emit_else ? if_data.false_block : if_data.after_block);
- ir_builder->SetInsertPoint(if_data.true_block);
- ir_builder->CreateBr(if_data.after_block);
+ b->SetInsertPoint(if_data.true_block);
+ b->CreateBr(if_data.after_block);
if (emit_else) {
- ir_builder->SetInsertPoint(if_data.false_block);
- ir_builder->CreateBr(if_data.after_block);
+ b->SetInsertPoint(if_data.false_block);
+ b->CreateBr(if_data.after_block);
}
- ir_builder->SetInsertPoint(if_data.after_block,
- if_data.after_block->getFirstInsertionPt());
+ b->SetInsertPoint(if_data.after_block,
+ if_data.after_block->getFirstInsertionPt());
return if_data;
}
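A usage sketch under assumed names ('cond', 'b'): emit a guarded region, then continue in the merge block, mirroring how the sort code below consumes the returned LlvmIfData.

// if (cond) { ...body... } -- a sketch with emit_else=false.
llvm_ir::LlvmIfData if_data =
    llvm_ir::EmitIfThenElse(cond, "guard", b, /*emit_else=*/false);
llvm_ir::SetToFirstInsertPoint(if_data.true_block, b);
// ... emit the guarded body here ...
llvm_ir::SetToFirstInsertPoint(if_data.after_block, b);  // resume after the if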
llvm::Value* EmitComparison(llvm::CmpInst::Predicate predicate,
llvm::Value* lhs_value, llvm::Value* rhs_value,
- llvm::IRBuilder<>* ir_builder) {
+ llvm::IRBuilder<>* b) {
llvm::Value* comparison_result;
if (lhs_value->getType()->isIntegerTy()) {
- comparison_result = ir_builder->CreateICmp(predicate, lhs_value, rhs_value);
+ comparison_result = b->CreateICmp(predicate, lhs_value, rhs_value);
} else {
- comparison_result = ir_builder->CreateFCmp(predicate, lhs_value, rhs_value);
+ comparison_result = b->CreateFCmp(predicate, lhs_value, rhs_value);
}
// comparison_result is i1, but the NVPTX codegen incorrectly lowers i1
// arrays. So we extend it to i8 so that it's addressable.
- return ir_builder->CreateZExt(
- comparison_result,
- llvm_ir::PrimitiveTypeToIrType(PRED, ModuleFromIRBuilder(ir_builder)));
+ return b->CreateZExt(comparison_result, llvm_ir::PrimitiveTypeToIrType(
+ PRED, ModuleFromIRBuilder(b)));
}
// Internal helper that is called from emitted code to log an int64 value with a
@@ -363,17 +360,14 @@ static void LogS64(const char* tag, int64 value) {
LOG(INFO) << tag << " (int64): " << value;
}
-void EmitLogging(const char* tag, llvm::Value* value,
- llvm::IRBuilder<>* ir_builder) {
+void EmitLogging(const char* tag, llvm::Value* value, llvm::IRBuilder<>* b) {
llvm::FunctionType* log_function_type = llvm::FunctionType::get(
- ir_builder->getVoidTy(),
- {ir_builder->getInt64Ty(), ir_builder->getInt64Ty()}, /*isVarArg=*/false);
- ir_builder->CreateCall(
+ b->getVoidTy(), {b->getInt64Ty(), b->getInt64Ty()}, /*isVarArg=*/false);
+ b->CreateCall(
log_function_type,
- ir_builder->CreateIntToPtr(
- ir_builder->getInt64(tensorflow::bit_cast<int64>(&LogS64)),
- log_function_type->getPointerTo()),
- {ir_builder->getInt64(tensorflow::bit_cast<int64>(tag)), value});
+ b->CreateIntToPtr(b->getInt64(tensorflow::bit_cast<int64>(&LogS64)),
+ log_function_type->getPointerTo()),
+ {b->getInt64(tensorflow::bit_cast<int64>(tag)), value});
}
void SetAlignmentMetadataForLoad(llvm::LoadInst* load, uint64_t alignment) {
@@ -663,5 +657,56 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
}
}
+std::pair<llvm::Value*, llvm::Value*> UMulLowHigh32(llvm::IRBuilder<>* b,
+ llvm::Value* src0,
+ llvm::Value* src1) {
+ CHECK_EQ(src0->getType()->getPrimitiveSizeInBits(), 32);
+ CHECK_EQ(src1->getType()->getPrimitiveSizeInBits(), 32);
+ llvm::Type* int64_ty = b->getInt64Ty();
+ src0 = b->CreateZExt(src0, int64_ty);
+ src1 = b->CreateZExt(src1, int64_ty);
+ return SplitInt64ToInt32s(b, b->CreateMul(src0, src1));
+}
+
+std::pair<llvm::Value*, llvm::Value*> SplitInt64ToInt32s(
+ llvm::IRBuilder<>* b, llvm::Value* value_64bits) {
+ CHECK_EQ(value_64bits->getType()->getPrimitiveSizeInBits(), 64);
+ llvm::Type* int32_ty = b->getInt32Ty();
+ llvm::Value* low_32bits = b->CreateTrunc(value_64bits, int32_ty);
+ llvm::Value* high_32bits =
+ b->CreateTrunc(b->CreateLShr(value_64bits, 32), int32_ty);
+ return std::make_pair(low_32bits, high_32bits);
+}
+
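A host-side scalar model of the two helpers above (a sketch using standard fixed-width types):

#include <cstdint>
#include <utility>

// (low, high) = src0 * src1 computed in 64 bits -- mirrors UMulLowHigh32
// followed by SplitInt64ToInt32s.
inline std::pair<uint32_t, uint32_t> UMulLowHigh32Model(uint32_t src0,
                                                        uint32_t src1) {
  uint64_t product = uint64_t{src0} * uint64_t{src1};
  return {static_cast<uint32_t>(product),         // low 32 bits
          static_cast<uint32_t>(product >> 32)};  // high 32 bits
}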
+llvm::GlobalVariable* GetOrCreateVariableForPhiloxRngState(
+ llvm::Module* module, llvm::IRBuilder<>* b) {
+ static const char* kPhiloxRngStateVariableName = "philox_rng_state";
+ llvm::GlobalVariable* state_ptr =
+ module->getNamedGlobal(kPhiloxRngStateVariableName);
+ if (!state_ptr) {
+ state_ptr = new llvm::GlobalVariable(
+ /*M=*/*module,
+ /*Ty=*/b->getInt64Ty(),
+ /*isConstant=*/false,
+ /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
+ /*Initializer=*/b->getInt64(0),
+ /*Name=*/kPhiloxRngStateVariableName);
+ }
+ return state_ptr;
+}
+
+void IncrementVariableForPhiloxRngState(int64 value, llvm::Module* module,
+ llvm::IRBuilder<>* builder) {
+ llvm::GlobalVariable* state_ptr =
+ GetOrCreateVariableForPhiloxRngState(module, builder);
+ llvm::Value* state_value_old = builder->CreateLoad(state_ptr, "load_state");
+ // If the 64-bit value overflows, we use the wraparound value. This should
+ // be fine in practice as we only add one to the value each time an RNG is
+ // executed.
+ llvm::Value* state_value_new = builder->CreateAdd(
+ state_value_old, builder->getInt64(value), "inc_state");
+ builder->CreateStore(state_value_new, state_ptr);
+}
+
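A hypothetical call site for the two helpers above, assuming 'module' and builder 'b' while lowering a single Rng HLO:

// Ensure the state global exists, bump it once for this RNG call, and read
// the value back to fold into the Philox seed.
llvm::GlobalVariable* state =
    llvm_ir::GetOrCreateVariableForPhiloxRngState(module, b);
llvm_ir::IncrementVariableForPhiloxRngState(/*value=*/1, module, b);
llvm::Value* seed_offset = b->CreateLoad(state, "rng_state");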
} // namespace llvm_ir
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index 9c51861eac..0958398534 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -105,26 +105,26 @@ llvm::Value* EmitCallToIntrinsic(
llvm::Intrinsic::ID intrinsic_id,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
tensorflow::gtl::ArraySlice<llvm::Type*> overloaded_types,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Emit float max. If fast math (no NaNs) is enabled, emits a simple
// fcmp+select; otherwise emits a NaN-propagating fcmp+select sequence.
llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Emit float min. If fast math (no NaNs) is enabled, emits a simple
// fcmp+select; otherwise emits a NaN-propagating fcmp+select sequence.
llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Convenience methods for emitting a GEP instruction that indexes into a buffer
// (1-dimensional array), equivalent to array[index]. The type is automatically
// determined from the element type of the array. The int64 index overload
// wraps the index in an i64 llvm::Value.
llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, llvm::Value* index,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, int64 index,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Returns the LLVM type which represents the given XLA primitive type.
llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
@@ -139,8 +139,9 @@ llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module);
// Returns a value that represents a pointer to a global string constant that
// encodes the shape as a serialized protobuf.
-StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(
- const Shape& shape, int32* shape_size, llvm::IRBuilder<>* ir_builder);
+StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(const Shape& shape,
+ int32* shape_size,
+ llvm::IRBuilder<>* b);
// Inverts the encoding of a Shape protobuf into an LLVM global variable.
//
@@ -164,21 +165,21 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
// through a loop.
llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type,
tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder,
+ llvm::IRBuilder<>* b,
int alignment = 0);
// As EmitAllocaAtFunctionEntry, but allocates element_count entries
// instead of a single element.
llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(
llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder, int alignment = 0);
+ llvm::IRBuilder<>* b, int alignment = 0);
// Creates a basic block with the same context and function as the
// builder. Inserts at the end of the function if insert_before is
// null.
llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before,
tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Struct with data on a conditional branch in a diamond shape created
// via EmitIfThenElse.
@@ -210,13 +211,13 @@ struct LlvmIfData {
// block with a terminator. If you need to use this for a
// non-terminated block, just make the function able to do that too.
LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder, bool emit_else = true);
+ llvm::IRBuilder<>* b, bool emit_else = true);
// Emits a compare operation between "lhs" and "rhs" with the given predicate,
// and then converts the result to i8 so that it is addressable.
llvm::Value* EmitComparison(llvm::CmpInst::Predicate predicate,
llvm::Value* lhs, llvm::Value* rhs,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Emits a call that logs the given value with the given tag as a prefix.
// The provided tag and value are passed to a runtime logging call that is
@@ -228,8 +229,7 @@ llvm::Value* EmitComparison(llvm::CmpInst::Predicate predicate,
// Precondition: value must be an int64.
// Precondition: tag must be a stable pointer for the lifetime of the generated
// program (the constant pointer is burned in to the program).
-void EmitLogging(const char* tag, llvm::Value* value,
- llvm::IRBuilder<>* ir_builder);
+void EmitLogging(const char* tag, llvm::Value* value, llvm::IRBuilder<>* b);
// Adds alignment metadata to a load instruction using the given alignment.
// The alignment refers to the result of the load, not the load itself.
@@ -292,6 +292,27 @@ llvm::Function* CreateFunction(llvm::FunctionType* function_type,
// don't start with xla_ to LLVM.
void InitializeLLVMCommandLineOptions(const HloModuleConfig& config);
+// Zero-extends two 32-bit values to 64 bits, multiplies them, and returns the
+// result as a pair of (low 32 bits, high 32 bits).
+std::pair<llvm::Value*, llvm::Value*> UMulLowHigh32(llvm::IRBuilder<>* b,
+ llvm::Value* src0,
+ llvm::Value* src1);
+// Splits the 64-bit integer value into its low and high 32 bits, returned as
+// a pair of (low 32 bits, high 32 bits).
+std::pair<llvm::Value*, llvm::Value*> SplitInt64ToInt32s(
+ llvm::IRBuilder<>* b, llvm::Value* value_64bits);
+
+// Checks whether a global variable has already been created to represent the
+// state passed between RNG calls implemented with the Philox algorithm. If
+// not, creates such a variable. Returns the global variable.
+llvm::GlobalVariable* GetOrCreateVariableForPhiloxRngState(
+ llvm::Module* module, llvm::IRBuilder<>* b);
+
+// Adds a value to the global state variable each time an RNG HLO is
+// executed. The value of this global state variable is added to the seed
+// of the Philox RNG algorithm so that calling the same RNG HLO multiple
+// times rarely produces the same result.
+void IncrementVariableForPhiloxRngState(int64 value, llvm::Module* module,
+ llvm::IRBuilder<>* b);
} // namespace llvm_ir
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index e8b0605b9d..36f5fa1952 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -33,26 +33,24 @@ namespace xla {
namespace llvm_ir {
LoopEmitter::LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape,
- llvm::IRBuilder<>* ir_builder)
- : body_emitter_(body_emitter), shape_(shape), ir_builder_(ir_builder) {}
+ llvm::IRBuilder<>* b)
+ : body_emitter_(body_emitter), shape_(shape), b_(b) {}
LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
- const IrArray& target_array,
- llvm::IRBuilder<>* ir_builder)
+ const IrArray& target_array, llvm::IRBuilder<>* b)
: body_emitter_([=](const llvm_ir::IrArray::Index array_index) -> Status {
// Convert target_element_generator to a BodyEmitter.
TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
target_element_generator(array_index));
- target_array.EmitWriteArrayElement(array_index, target_element,
- ir_builder);
+ target_array.EmitWriteArrayElement(array_index, target_element, b);
return Status::OK();
}),
shape_(target_array.GetShape()),
- ir_builder_(ir_builder) {}
+ b_(b) {}
static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion(
const ElementGenerator& target_element_generator,
- const std::vector<IrArray>& target_arrays, llvm::IRBuilder<>* ir_builder) {
+ const std::vector<IrArray>& target_arrays, llvm::IRBuilder<>* b) {
return [=](const llvm_ir::IrArray::Index array_index) {
TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
target_element_generator(array_index));
@@ -64,8 +62,7 @@ static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion(
for (int64 i = 0; i < target_arrays.size(); ++i) {
target_arrays[i].EmitWriteArrayElement(
- array_index, ir_builder->CreateExtractValue(target_element, i),
- ir_builder);
+ array_index, b->CreateExtractValue(target_element, i), b);
}
return Status::OK();
};
@@ -73,13 +70,12 @@ static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion(
LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
tensorflow::gtl::ArraySlice<IrArray> target_arrays,
- llvm::IRBuilder<>* ir_builder)
+ llvm::IRBuilder<>* b)
: body_emitter_(MakeBodyEmitterForMultiOutputFusion(
target_element_generator,
- std::vector<IrArray>(target_arrays.begin(), target_arrays.end()),
- ir_builder)),
+ std::vector<IrArray>(target_arrays.begin(), target_arrays.end()), b)),
shape_(target_arrays[0].GetShape()),
- ir_builder_(ir_builder) {
+ b_(b) {
// Sanity check: In multi-output fusion, all shapes produced must have the
// same dimensions.
for (const IrArray& array : target_arrays) {
@@ -102,7 +98,7 @@ std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
// Loops are added from outermost to innermost order with the ForLoopNest
// class so emit loops in order from most-major dimension down to most-minor
// dimension (of the target shape).
- ForLoopNest loop_nest(loop_name, ir_builder_);
+ ForLoopNest loop_nest(loop_name, b_);
IrArray::Index array_index(index_type, shape_.dimensions_size());
for (int i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) {
int64 dimension = LayoutUtil::Major(shape_.layout(), i);
@@ -116,8 +112,8 @@ std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
// Set IR builder insertion point to the loop body basic block of the
// innermost loop.
llvm::BasicBlock* innermost_body_bb = loop_nest.GetInnerLoopBodyBasicBlock();
- ir_builder_->SetInsertPoint(innermost_body_bb,
- innermost_body_bb->getFirstInsertionPt());
+ b_->SetInsertPoint(innermost_body_bb,
+ innermost_body_bb->getFirstInsertionPt());
// Set exit_bb_ to the exit block of the loop nest.
exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
@@ -129,7 +125,7 @@ std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name,
llvm::Type* index_type) {
if (index_type == nullptr) {
- index_type = ir_builder_->getInt64Ty();
+ index_type = b_->getInt64Ty();
}
for (const IrArray::Index& array_index :
@@ -137,10 +133,10 @@ Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name,
TF_RETURN_IF_ERROR(body_emitter_(array_index));
}
- // Set the insertion point of ir_builder_ to the loop exit, so that
+ // Set the insertion point of b_ to the loop exit, so that
// code emitted for later instructions will be correctly placed.
if (exit_bb_ != nullptr) {
- ir_builder_->SetInsertPoint(exit_bb_);
+ b_->SetInsertPoint(exit_bb_);
}
return Status::OK();
}
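A usage sketch of the element-generator constructor, assuming a target IrArray 'out' and builder 'b'; the generator returns a constant purely for illustration:

// Emit a loop nest writing 1.0f to every element of 'out' (a sketch).
TF_RETURN_IF_ERROR(
    llvm_ir::LoopEmitter(
        [&](const llvm_ir::IrArray::Index& index) -> StatusOr<llvm::Value*> {
          return llvm::ConstantFP::get(b->getFloatTy(), 1.0);
        },
        out, b)
        .EmitLoop("fill_ones", /*index_type=*/nullptr));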
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 6be1c2fba2..c4f5c82086 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -41,11 +41,11 @@ class LoopEmitter {
using BodyEmitter = std::function<Status(const IrArray::Index& index)>;
LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
// Constructs a LoopEmitter from an element generator that generates each
// element of the given target array.
LoopEmitter(const ElementGenerator& target_element_generator,
- const IrArray& target_array, llvm::IRBuilder<>* ir_builder);
+ const IrArray& target_array, llvm::IRBuilder<>* b);
// Constructs a LoopEmitter that emits one element into each of N separate
// arrays on each iteration of the loop.
@@ -54,7 +54,7 @@ class LoopEmitter {
// produce an LLVM struct with N elements.
LoopEmitter(const ElementGenerator& target_element_generator,
tensorflow::gtl::ArraySlice<IrArray> target_arrays,
- llvm::IRBuilder<>* ir_builder);
+ llvm::IRBuilder<>* b);
LoopEmitter(const LoopEmitter&) = delete;
LoopEmitter& operator=(const LoopEmitter&) = delete;
@@ -65,8 +65,7 @@ class LoopEmitter {
// specifies the element, will return multiple indices if the loop is
// unrolled.
std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock() {
- return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"",
- ir_builder_->getInt64Ty());
+ return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"", b_->getInt64Ty());
}
virtual std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock(
@@ -87,7 +86,7 @@ class LoopEmitter {
// scalar, no loops are emitted and exit_bb_ is nullptr in that case.
llvm::BasicBlock* exit_bb_;
- llvm::IRBuilder<>* ir_builder_;
+ llvm::IRBuilder<>* b_;
};
} // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc
new file mode 100644
index 0000000000..0e115cdabf
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+
+namespace xla {
+namespace llvm_ir {
+
+llvm::Value* EmitFastTanh(llvm::IRBuilder<>* b, llvm::Value* input) {
+ llvm::Type* type = input->getType();
+
+ // Clamp the input to [-9, 9].
+ llvm::Value* input_clamped = llvm_ir::EmitFloatMin(
+ llvm_ir::EmitFloatMax(input, llvm::ConstantFP::get(type, -9.0), b),
+ llvm::ConstantFP::get(type, 9.0), b);
+
+ static constexpr std::array<float, 7> numerator_coeffs{
+ -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
+ 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
+ 4.89352455891786e-03f};
+
+ static constexpr std::array<float, 4> denominator_coeffs{
+ 1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
+ 4.89352518554385e-03f};
+
+ llvm::Value* input_squared = b->CreateFMul(input_clamped, input_clamped);
+ llvm::Value* numerator = llvm::ConstantFP::get(type, numerator_coeffs[0]);
+ for (int i = 1; i < numerator_coeffs.size(); i++) {
+ numerator = b->CreateFAdd(b->CreateFMul(input_squared, numerator),
+ llvm::ConstantFP::get(type, numerator_coeffs[i]));
+ }
+
+ numerator = b->CreateFMul(input_clamped, numerator);
+
+ llvm::Value* denominator = llvm::ConstantFP::get(type, denominator_coeffs[0]);
+ for (int i = 1; i < denominator_coeffs.size(); i++) {
+ denominator =
+ b->CreateFAdd(b->CreateFMul(input_squared, denominator),
+ llvm::ConstantFP::get(type, denominator_coeffs[i]));
+ }
+
+ return b->CreateFDiv(numerator, denominator);
+}
+
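For reference, a host-side scalar model of the emitted IR, reusing the same coefficients (a sketch, not part of the change):

#include <algorithm>
#include <array>

// Rational tanh approximation: clamp, then Horner-evaluate p(x^2)*x / q(x^2).
inline float FastTanhModel(float x) {
  x = std::min(9.0f, std::max(-9.0f, x));  // clamp to [-9, 9]
  const float x2 = x * x;
  constexpr std::array<float, 7> p = {
      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
      5.12229709037114e-08f,  1.48572235717979e-05f, 6.37261928875436e-04f,
      4.89352455891786e-03f};
  constexpr std::array<float, 4> q = {
      1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
      4.89352518554385e-03f};
  float num = p[0];
  for (int i = 1; i < 7; ++i) num = num * x2 + p[i];  // Horner's rule
  num *= x;
  float den = q[0];
  for (int i = 1; i < 4; ++i) den = den * x2 + q[i];
  return num / den;
}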
+} // namespace llvm_ir
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/pool_test.cc b/tensorflow/compiler/xla/service/llvm_ir/math_ops.h
index 8c4fe258e3..6c8bc3a076 100644
--- a/tensorflow/compiler/xla/service/pool_test.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/math_ops.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -13,28 +13,20 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/xla/service/pool.h"
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_MATH_OPS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_MATH_OPS_H_
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Value.h"
namespace xla {
-namespace {
+namespace llvm_ir {
-using PoolTest = ::testing::Test;
+// Emits an approximation of tanh. The implementation uses the same rational
+// interpolant as Eigen3.
+llvm::Value* EmitFastTanh(llvm::IRBuilder<>* b, llvm::Value* input);
-TEST_F(PoolTest, Test) {
- Pool<int> pool;
-
- {
- auto ptr = pool.Allocate();
- EXPECT_NE(nullptr, ptr.get());
- *ptr = 5;
- }
-
- auto ptr = pool.Allocate();
- EXPECT_NE(nullptr, ptr.get());
- EXPECT_EQ(5, *ptr);
-}
-
-} // namespace
+} // namespace llvm_ir
} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_MATH_OPS_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index 16a9a5aaeb..5187948e29 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -19,14 +19,18 @@ limitations under the License.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
+#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/optional.h"
#include "tensorflow/core/platform/types.h"
namespace xla {
@@ -35,49 +39,58 @@ namespace llvm_ir {
namespace {
// Adds the inner comparison loop where we compare elements pointed to by
// 'keys_index' and 'compare_keys_index'.
-void EmitCompareLoop(int64 dimension_to_sort,
- const llvm_ir::IrArray::Index& keys_index,
- const llvm_ir::IrArray::Index& compare_keys_index,
- const llvm_ir::IrArray& keys_array,
- llvm::IRBuilder<>* ir_builder) {
- // TODO(b/26783907): parallelize this loop.
-
+void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index,
+ const IrArray::Index& compare_keys_index,
+ const IrArray& keys_array,
+ const tensorflow::gtl::optional<IrArray>& values_array,
+ llvm::IRBuilder<>* b) {
// if (is_smaller_index &&
// compare_keys[dimension_to_sort] < dimension_to_sort_bound)
- llvm::Value* is_smaller_index = ir_builder->CreateICmpSLT(
+ llvm::Value* is_smaller_index = b->CreateICmpSLT(
keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]);
int64 dimension_to_sort_bound =
keys_array.GetShape().dimensions(dimension_to_sort);
- auto if_data = llvm_ir::EmitIfThenElse(
- ir_builder->CreateAnd(
- is_smaller_index,
- ir_builder->CreateICmpSLT(
- compare_keys_index[dimension_to_sort],
- keys_index.GetConstantWithIndexType(dimension_to_sort_bound))),
- "smaller_comparison_index", ir_builder, /*emit_else=*/false);
- SetToFirstInsertPoint(if_data.true_block, ir_builder);
- auto key1 = keys_array.EmitReadArrayElement(keys_index, ir_builder);
- auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, ir_builder);
+ auto if_data = EmitIfThenElse(
+ b->CreateAnd(is_smaller_index,
+ b->CreateICmpSLT(compare_keys_index[dimension_to_sort],
+ keys_index.GetConstantWithIndexType(
+ dimension_to_sort_bound))),
+ "smaller_comparison_index", b, /*emit_else=*/false);
+ SetToFirstInsertPoint(if_data.true_block, b);
+ auto key1 = keys_array.EmitReadArrayElement(keys_index, b);
+ auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, b);
auto key_type = keys_array.GetShape().element_type();
auto comparison =
primitive_util::IsFloatingPointType(key_type)
// TODO(b/26783907): Figure out how to handle NaNs.
- ? ir_builder->CreateFCmp(llvm::FCmpInst::FCMP_ULT, key1, key2)
- : ir_builder->CreateICmp(
- primitive_util::IsSignedIntegralType(key_type)
- ? llvm::ICmpInst::ICMP_SLT
- : llvm::ICmpInst::ICMP_ULT,
- key1, key2);
- auto min_key = ir_builder->CreateSelect(comparison, key1, key2);
- auto max_key = ir_builder->CreateSelect(comparison, key2, key1);
- keys_array.EmitWriteArrayElement(keys_index, min_key, ir_builder);
- keys_array.EmitWriteArrayElement(compare_keys_index, max_key, ir_builder);
+ ? b->CreateFCmp(llvm::FCmpInst::FCMP_ULT, key2, key1)
+ : b->CreateICmp(primitive_util::IsSignedIntegralType(key_type)
+ ? llvm::ICmpInst::ICMP_SLT
+ : llvm::ICmpInst::ICMP_ULT,
+ key2, key1);
+ // If key2 < key1
+ auto if_smaller_data =
+ EmitIfThenElse(comparison, "is_smaller_than", b, /*emit_else=*/false);
+ SetToFirstInsertPoint(if_smaller_data.true_block, b);
+ // Swap key1 with key2.
+ keys_array.EmitWriteArrayElement(keys_index, key2, b);
+ keys_array.EmitWriteArrayElement(compare_keys_index, key1, b);
+ if (values_array.has_value()) {
+ // Also swap the values.
+ auto value1 = values_array.value().EmitReadArrayElement(keys_index, b);
+ auto value2 =
+ values_array.value().EmitReadArrayElement(compare_keys_index, b);
+ values_array.value().EmitWriteArrayElement(keys_index, value2, b);
+ values_array.value().EmitWriteArrayElement(compare_keys_index, value1, b);
+ }
}
} // namespace
Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
- tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder) {
+ const tensorflow::gtl::optional<IrArray>& values_array,
+ tensorflow::StringPiece name, llvm::Value* xor_mask,
+ llvm::IRBuilder<>* b,
+ const gpu::LaunchDimensions* launch_dimensions) {
const Shape& keys_shape = keys_array.GetShape();
// TODO(b/26783907): This case can probably be avoided with the Algebraic
@@ -87,11 +100,13 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
}
// Create loop nests which loop through the operand dimensions. The sort
- // dimension is handled in three separate innermost loops which perform the
- // sorting.
- ForLoopNest loop_nest(name, ir_builder);
+ // dimension is handled in the innermost loop which performs the sorting.
+ ForLoopNest loop_nest(name, b);
IrArray::Index keys_index =
loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys");
+ if (loop_nest.GetInnerLoopBodyBasicBlock() != nullptr) {
+ SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b);
+ }
// 'compare_keys_index' is the index of the element that 'keys_index' should
// be compared to.
@@ -104,95 +119,46 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
}
}
- // Create the sorting loops which do the sorting.
- int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
- std::unique_ptr<ForLoop> stages_loop = loop_nest.AddLoop(
- /*start_index=*/0,
- /*end_index=*/
- tensorflow::Log2Ceiling64(dimension_to_sort_bound),
- /*suffix=*/"sort_stages");
- std::unique_ptr<ForLoop> mask_loop = loop_nest.AddLoop(
- /*suffix=*/"mask",
- /*start_index=*/keys_index.GetConstantWithIndexType(0),
- /*end_index=*/stages_loop->GetIndVarValue());
- std::unique_ptr<ForLoop> compare_loop = loop_nest.AddLoop(
- /*start_index=*/0,
- /*end_index=*/dimension_to_sort_bound,
- /*suffix=*/"compare");
-
- // Naive C++ code for the inner loops (without parallelization):
+ // Naive C++ code for the inner compare loop:
//
- // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
- // ++stage) {
- // int64 first_xor_mask = (1LL << (stage + 1)) - 1;
- // for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
- // int64 j = i ^ first_xor_mask;
- // if (i < j && j < dimension_to_sort_bound) {
- // int64 min_key = std::min(keys[i], keys[j]);
- // keys[j] = std::max(keys[i], keys[j]);
- // keys[i] = min_key;
- // }
- // }
- // for (int64 mask = 0; mask < stage; ++mask) {
- // int64 later_xor_mask = (1LL << (stage - (mask + 1));
- // for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
- // int64 j = i ^ later_xor_mask;
- // if (i < j && j < dimension_to_sort_bound) {
- // int64 min_key = std::min(keys[i], keys[j]);
- // keys[j] = std::max(keys[i], keys[j]);
- // keys[i] = min_key;
- // }
- // }
+ // for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
+ // int64 j = i ^ xor_mask;
+ // if (i < j && j < dimension_to_sort_bound) {
+ // int64 min_key = std::min(keys[i], keys[j]);
+ // keys[j] = std::max(keys[i], keys[j]);
+ // keys[i] = min_key;
// }
// }
//
// This follows the algorithm described on Wikipedia:
// https://en.wikipedia.org/wiki/Bitonic_sorter
- SetToFirstInsertPoint(stages_loop->GetBodyBasicBlock(), ir_builder);
- // The first xor mask of a stage is 2^(stage + 1) - 1.
- auto first_xor_mask = ir_builder->CreateSub(
- ir_builder->CreateShl(
- keys_index.GetConstantWithIndexType(1),
- ir_builder->CreateAdd(stages_loop->GetIndVarValue(),
- keys_index.GetConstantWithIndexType(1))),
- keys_index.GetConstantWithIndexType(1));
- std::unique_ptr<ForLoop> first_compare_loop = ForLoop::EmitForLoop(
- /*prefix=*/"first_compare",
- /*start_index=*/keys_index.GetConstantWithIndexType(0),
- /*end_index=*/
- keys_index.GetConstantWithIndexType(dimension_to_sort_bound),
- /*step=*/keys_index.GetConstantWithIndexType(1),
- /*ir_builder=*/ir_builder);
-
- SetToFirstInsertPoint(first_compare_loop->GetBodyBasicBlock(), ir_builder);
- // 'first_compare_loop' iterates through the 'dimension_to_sort'.
- keys_index[dimension_to_sort] = first_compare_loop->GetIndVarValue();
- compare_keys_index[dimension_to_sort] = ir_builder->CreateXor(
- first_compare_loop->GetIndVarValue(), first_xor_mask);
- EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array,
- ir_builder);
-
- SetToFirstInsertPoint(compare_loop->GetPreheaderBasicBlock(), ir_builder);
- // The later masks of a stage are 2^(stage - (mask_loop_ind_var + 1)).
- auto later_xor_mask = ir_builder->CreateShl(
- keys_index.GetConstantWithIndexType(1),
- ir_builder->CreateSub(
- stages_loop->GetIndVarValue(),
- ir_builder->CreateAdd(mask_loop->GetIndVarValue(),
- keys_index.GetConstantWithIndexType(1))));
-
- SetToFirstInsertPoint(compare_loop->GetBodyBasicBlock(), ir_builder);
- // 'compare_loop' iterates through the 'dimension_to_sort'.
- keys_index[dimension_to_sort] = compare_loop->GetIndVarValue();
- compare_keys_index[dimension_to_sort] =
- ir_builder->CreateXor(compare_loop->GetIndVarValue(), later_xor_mask);
- EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index, keys_array,
- ir_builder);
+ int64 dimension_to_sort_bound =
+ keys_array.GetShape().dimensions(dimension_to_sort);
+ Shape compare_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
+ {dimension_to_sort_bound});
+ auto compare_loop_body_emitter =
+ [&](const IrArray::Index& compare_index) -> Status {
+ keys_index[dimension_to_sort] = compare_index[0];
+ compare_keys_index[dimension_to_sort] =
+ b->CreateXor(compare_index[0], xor_mask);
+ EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index,
+ keys_array, values_array, b);
+ return Status::OK();
+ };
+ if (launch_dimensions != nullptr) {
+ TF_RETURN_IF_ERROR(gpu::ParallelLoopEmitter(compare_loop_body_emitter,
+ compare_shape,
+ *launch_dimensions, b)
+ .EmitLoop(name));
+ } else {
+ TF_RETURN_IF_ERROR(LoopEmitter(compare_loop_body_emitter, compare_shape, b)
+ .EmitLoop(name));
+ }
// Set the IR builder insert point to the exit basic block of the outermost
// loop. This ensures later instructions are inserted after this loop nest.
- ir_builder->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
+ b->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
return Status::OK();
}
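Since the stage and mask loops were hoisted out of this function, a hypothetical caller now enumerates the xor masks itself, following the mask sequence from the removed comment (names assumed):

// One EmitSortInPlace pass per xor mask of each bitonic stage (a sketch).
for (int64 stage = 0; stage < tensorflow::Log2Ceiling64(bound); ++stage) {
  // The first mask of a stage is 2^(stage + 1) - 1.
  TF_RETURN_IF_ERROR(EmitSortInPlace(
      dim, keys_array, values_array, "sort",
      b->getInt64((1LL << (stage + 1)) - 1), b, launch_dimensions));
  for (int64 mask = 0; mask < stage; ++mask) {
    // The later masks of a stage are 2^(stage - (mask + 1)).
    TF_RETURN_IF_ERROR(EmitSortInPlace(
        dim, keys_array, values_array, "sort",
        b->getInt64(1LL << (stage - (mask + 1))), b, launch_dimensions));
  }
}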
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
index fc45bfab12..8458744c6b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -16,18 +16,25 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/optional.h"
#include "tensorflow/core/platform/types.h"
namespace xla {
namespace llvm_ir {
-// Emits llvm IR to sort the 'dimension_to_sort' dimension of 'keys_array' into
-// ascending order.
+// Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort'
+// dimension of 'keys_array'. All other dimensions are kept as-is. This
+// implements the inner loop of BitonicSort. If 'launch_dimensions' is nullptr,
+// the inner compare loop will not be parallelized.
Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
- tensorflow::StringPiece name,
- llvm::IRBuilder<>* ir_builder);
+ const tensorflow::gtl::optional<IrArray>& values_array,
+ tensorflow::StringPiece name, llvm::Value* xor_mask,
+ llvm::IRBuilder<>* b,
+ const gpu::LaunchDimensions* launch_dimensions);
} // namespace llvm_ir
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
index 5fc08aab91..11ed6ee59f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
@@ -31,12 +31,12 @@ namespace llvm_ir {
void EmitTupleSelect(const IrArray& select, const IrArray& pred,
llvm::Value* on_true, llvm::Value* on_false,
- llvm::IRBuilder<>* ir_builder, llvm::Module* module) {
+ llvm::IRBuilder<>* b, llvm::Module* module) {
CHECK(ShapeUtil::IsScalar(pred.GetShape()));
llvm::LoadInst* pred_value =
- ir_builder->CreateLoad(pred.GetBasePointer(), "load_predicate_value");
- llvm::Value* pred_cond = ir_builder->CreateICmpNE(
+ b->CreateLoad(pred.GetBasePointer(), "load_predicate_value");
+ llvm::Value* pred_cond = b->CreateICmpNE(
pred_value,
llvm::ConstantInt::get(PrimitiveTypeToIrType(PRED, module), 0),
"boolean_predicate");
@@ -46,47 +46,42 @@ void EmitTupleSelect(const IrArray& select, const IrArray& pred,
VLOG(2) << " pred_cond: " << DumpToString(*pred_cond);
for (int i = 0; i < ShapeUtil::TupleElementCount(select.GetShape()); ++i) {
- llvm::Value* const element_index[] = {ir_builder->getInt64(0),
- ir_builder->getInt64(i)};
+ llvm::Value* const element_index[] = {b->getInt64(0), b->getInt64(i)};
llvm::Value* on_true_element_address =
- ir_builder->CreateInBoundsGEP(on_true, element_index);
- llvm::Value* on_true_element = ir_builder->CreateLoad(
+ b->CreateInBoundsGEP(on_true, element_index);
+ llvm::Value* on_true_element = b->CreateLoad(
on_true_element_address, "on_true_element_" + llvm::Twine(i));
llvm::Value* on_false_element_address =
- ir_builder->CreateInBoundsGEP(on_false, element_index);
- llvm::Value* on_false_element = ir_builder->CreateLoad(
+ b->CreateInBoundsGEP(on_false, element_index);
+ llvm::Value* on_false_element = b->CreateLoad(
on_false_element_address, "on_false_element_" + llvm::Twine(i));
llvm::Value* output_element_address =
- ir_builder->CreateInBoundsGEP(select.GetBasePointer(), element_index);
- ir_builder->CreateStore(
- ir_builder->CreateSelect(pred_cond, on_true_element, on_false_element,
- "select_output_element_" + llvm::Twine(i)),
- output_element_address);
+ b->CreateInBoundsGEP(select.GetBasePointer(), element_index);
+ b->CreateStore(b->CreateSelect(pred_cond, on_true_element, on_false_element,
+ "select_output_element_" + llvm::Twine(i)),
+ output_element_address);
}
}
void EmitTuple(const IrArray& tuple,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
- llvm::IRBuilder<>* ir_builder, llvm::Module* module) {
+ llvm::IRBuilder<>* b, llvm::Module* module) {
for (size_t i = 0; i < operands.size(); ++i) {
- auto* store = ir_builder->CreateStore(
- ir_builder->CreatePointerCast(operands[i],
- PrimitiveTypeToIrType(TUPLE, module)),
- ir_builder->CreateInBoundsGEP(
- tuple.GetBasePointer(),
- {ir_builder->getInt64(0), ir_builder->getInt64(i)}));
+ auto* store = b->CreateStore(
+ b->CreatePointerCast(operands[i], PrimitiveTypeToIrType(TUPLE, module)),
+ b->CreateInBoundsGEP(tuple.GetBasePointer(),
+ {b->getInt64(0), b->getInt64(i)}));
tuple.AnnotateLoadStoreInstructionWithMetadata(store);
}
}
llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
int alignment, llvm::Value* operand,
- llvm::IRBuilder<>* ir_builder,
- llvm::Module* module) {
- llvm::Value* element_ptr = ir_builder->CreateInBoundsGEP(
- operand, {ir_builder->getInt64(0), ir_builder->getInt64(index)});
- llvm::LoadInst* src_buffer = ir_builder->CreateLoad(element_ptr);
+ llvm::IRBuilder<>* b, llvm::Module* module) {
+ llvm::Value* element_ptr =
+ b->CreateInBoundsGEP(operand, {b->getInt64(0), b->getInt64(index)});
+ llvm::LoadInst* src_buffer = b->CreateLoad(element_ptr);
// Mark the loaded pointer as dereferenceable if we know its shape.
if (!ShapeUtil::IsOpaque(target_shape)) {
@@ -98,7 +93,7 @@ llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
llvm::Type* element_type = ShapeToIrType(target_shape, module);
llvm::Value* ret_val =
- ir_builder->CreateBitCast(src_buffer, element_type->getPointerTo());
+ b->CreateBitCast(src_buffer, element_type->getPointerTo());
return ret_val;
}
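A usage sketch (names assumed): fetch the buffer pointer for element 1 of a tuple.

// Load tuple_base_ptr[1] and bitcast it to the element shape's IR type.
llvm::Value* elem = llvm_ir::EmitGetTupleElement(
    element_shape, /*index=*/1, /*alignment=*/8, tuple_base_ptr, b, module);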
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h
index 352d34ebf8..cf6bf5d0b1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h
@@ -61,13 +61,13 @@ namespace llvm_ir {
// output[i] = pred ? tuple_on_true[i] : tuple_on_false[i]
void EmitTupleSelect(const IrArray& select, const IrArray& pred,
llvm::Value* on_true, llvm::Value* on_false,
- llvm::IRBuilder<>* ir_builder, llvm::Module* module);
+ llvm::IRBuilder<>* b, llvm::Module* module);
// A tuple is an array of pointers, one for each operand. Each pointer points to
// the output buffer of its corresponding operand.
void EmitTuple(const IrArray& tuple,
tensorflow::gtl::ArraySlice<llvm::Value*> operands,
- llvm::IRBuilder<>* ir_builder, llvm::Module* module);
+ llvm::IRBuilder<>* b, llvm::Module* module);
// A tuple is an array of pointers, one for each operand. Each pointer points to
// the output buffer of its corresponding operand. A GetTupleElement instruction
@@ -75,8 +75,7 @@ void EmitTuple(const IrArray& tuple,
// Returns an llvm value representing a pointer to the tuple element buffer.
llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
int alignment, llvm::Value* operand,
- llvm::IRBuilder<>* ir_builder,
- llvm::Module* module);
+ llvm::IRBuilder<>* b, llvm::Module* module);
} // namespace llvm_ir
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 53efc30c36..5e02096ee5 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -20,6 +20,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/compiler/xla/client/executable_build_options.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/execution_options_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/backend.h"
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 39d6734c3f..8f707ea904 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -19,7 +19,7 @@ limitations under the License.
#include <memory>
#include "tensorflow/compiler/xla/client/executable_build_options.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
diff --git a/tensorflow/compiler/xla/service/pool.h b/tensorflow/compiler/xla/service/pool.h
deleted file mode 100644
index 8e710ebb6d..0000000000
--- a/tensorflow/compiler/xla/service/pool.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_POOL_H_
-#define TENSORFLOW_COMPILER_XLA_POOL_H_
-
-#include <functional>
-#include <vector>
-
-#include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace xla {
-
-// Pool of values, which are created as needed and destroyed when the `Pool` is
-// destroyed
-template <typename T>
-class Pool {
- public:
- struct Deleter {
- void operator()(T* ptr) { pool->Deallocate(ptr); }
-
- Pool<T>* pool;
- };
-
- // A pointer to a taken element of a `Pool` which returns it to the pool on
- // destruction
- using SmartPtr = std::unique_ptr<T, Deleter>;
-
- // Constructs a `Pool` with given factory function, which need not be
- // thread-safe.
- explicit Pool(std::function<std::unique_ptr<T>()> factory)
- : factory_(factory) {}
-
- explicit Pool() : Pool([]() { return MakeUnique<T>(); }) {}
-
- // Returns a pointer to a value in the pool, creating a new value if none is
- // free. The returned smart pointer returns the element to the pool on
- // destruction.
- //
- // This method is thread-safe.
- SmartPtr Allocate() {
- tensorflow::mutex_lock lock(mu_);
- T* ptr;
- if (!xs_.empty()) {
- ptr = std::move(xs_.back()).release();
- xs_.pop_back();
- } else {
- ptr = factory_().release();
- }
- Deleter del = {this};
- return std::unique_ptr<T, Deleter>(ptr, del);
- }
-
- private:
- // Puts a pointer to a value back into the pool, leaving it free for future
- // use.
- //
- // This method is thread-safe.
- void Deallocate(T* ptr) {
- tensorflow::mutex_lock lock(mu_);
- xs_.push_back(std::unique_ptr<T>(ptr));
- }
-
- const std::function<std::unique_ptr<T>()> factory_ GUARDED_BY(mu_);
- std::vector<std::unique_ptr<T>> xs_ GUARDED_BY(mu_);
- tensorflow::mutex mu_;
-};
-
-} // namespace xla
-
-#endif // TENSORFLOW_COMPILER_XLA_POOL_H_
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 636013cbb5..ce070bc5b6 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -37,6 +37,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_proto_util.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/service/source_map_util.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_layout.h"
#include "tensorflow/compiler/xla/shape_util.h"
@@ -376,7 +377,7 @@ Service::ExecuteParallelAndRegisterResult(
ExecutionProfile* profile) {
// Streams on which the computations are launched, so we can wait on the
// streams to complete.
- std::vector<Pool<se::Stream>::SmartPtr> streams;
+ std::vector<StreamPool::Ptr> streams;
std::vector<std::unique_ptr<se::Timer>> timers;
// Global data handles for the computation results, one for each computation.
@@ -403,7 +404,7 @@ Service::ExecuteParallelAndRegisterResult(
CHECK_EQ(replicas.size(), arguments[i].size());
std::vector<ScopedShapedBuffer> result_buffers;
for (int64 replica = 0; replica < replicas.size(); ++replica) {
- TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
+ TF_ASSIGN_OR_RETURN(StreamPool::Ptr stream,
backend->BorrowStream(replicas[replica]));
streams.push_back(std::move(stream));
@@ -515,13 +516,13 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
arguments,
Backend* backend, const string& result_tag, ExecutionProfile* profile) {
// Set up streams.
- std::vector<Pool<se::Stream>::SmartPtr> streams;
+ std::vector<StreamPool::Ptr> streams;
TF_ASSIGN_OR_RETURN(auto replicas,
Replicas(*backend, SingleComputationDeviceHandle()));
TF_RET_CHECK(!replicas.empty());
for (se::StreamExecutor* executor : replicas) {
- TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
+ TF_ASSIGN_OR_RETURN(StreamPool::Ptr stream,
backend->BorrowStream(executor));
streams.push_back(std::move(stream));
}
@@ -533,7 +534,7 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
// Set up run options.
std::vector<ServiceExecutableRunOptions> run_options;
- for (const Pool<se::Stream>::SmartPtr& stream : streams) {
+ for (const StreamPool::Ptr& stream : streams) {
ExecutableRunOptions options;
options.set_stream(stream.get());
options.set_device_ordinal(stream->parent()->device_ordinal());
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index 7f3910cdb0..dbfed628bf 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -17,7 +17,7 @@ limitations under the License.
#define TENSORFLOW_COMPILER_XLA_SERVICE_SERVICE_EXECUTABLE_RUN_OPTIONS_H_
#include "tensorflow/compiler/xla/executable_run_options.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/stream_executor/stream_executor.h"
@@ -27,8 +27,7 @@ namespace xla {
// data, now only a stream cache for GPU backend.
class ServiceExecutableRunOptions {
public:
- using StreamBorrower =
- std::function<StatusOr<Pool<se::Stream>::SmartPtr>(int)>;
+ using StreamBorrower = std::function<StatusOr<StreamPool::Ptr>(int)>;
ServiceExecutableRunOptions()
: ServiceExecutableRunOptions(ExecutableRunOptions()) {}
@@ -51,7 +50,7 @@ class ServiceExecutableRunOptions {
// Borrows a stream and returns a smart pointer which returns the stream on
// destruction.
- StatusOr<Pool<se::Stream>::SmartPtr> BorrowStream(int device_ordinal) const {
+ StatusOr<StreamPool::Ptr> BorrowStream(int device_ordinal) const {
return borrow_stream_
? borrow_stream_(device_ordinal)
: Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
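With the alias change, a StreamBorrower is just a callable yielding StatusOr<StreamPool::Ptr>. A sketch of wiring one up; the `backend` object and its int-ordinal BorrowStream overload are assumptions here, not shown in this hunk:

// Sketch only: a StreamBorrower is any callable of this shape. `backend` is
// assumed to expose StatusOr<StreamPool::Ptr> BorrowStream(int ordinal).
ServiceExecutableRunOptions::StreamBorrower borrow_stream =
    [backend](int device_ordinal) -> StatusOr<StreamPool::Ptr> {
      return backend->BorrowStream(device_ordinal);
    };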
diff --git a/tensorflow/compiler/xla/service/stream_pool.cc b/tensorflow/compiler/xla/service/stream_pool.cc
new file mode 100644
index 0000000000..92bb21b816
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stream_pool.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stream_pool.h"
+
+#include "tensorflow/compiler/xla/ptr_util.h"
+
+namespace xla {
+
+StreamPool::Ptr StreamPool::BorrowStream(se::StreamExecutor* executor) {
+ std::unique_ptr<se::Stream> stream;
+ {
+ tensorflow::mutex_lock lock(mu_);
+ if (!streams_.empty()) {
+ // Re-use an existing stream from the pool.
+ stream = std::move(streams_.back());
+ streams_.pop_back();
+ }
+ }
+
+ if (!stream) {
+ // Create a new stream.
+ stream = MakeUnique<se::Stream>(executor);
+ stream->Init();
+ }
+
+  // Return the stream wrapped in Ptr, whose deleter hands it back to this
+  // pool on destruction.
+ PtrDeleter deleter = {this};
+ return Ptr(stream.release(), deleter);
+}
+
+void StreamPool::ReturnStream(se::Stream* stream) {
+ if (stream->ok()) {
+ tensorflow::mutex_lock lock(mu_);
+ streams_.emplace_back(stream);
+ } else {
+    // If the stream has encountered any errors, all subsequent operations
+    // on it will fail, so just delete it and rely on fresh streams being
+    // created on future borrows.
+ delete stream;
+ }
+}
+
+} // namespace xla
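Borrowing is scoped: the Ptr returned by BorrowStream gives the stream back (or deletes it, if it went bad) when it leaves scope. A hedged usage sketch, assuming `executor` is a live se::StreamExecutor*:

xla::StreamPool pool;
{
  xla::StreamPool::Ptr stream = pool.BorrowStream(executor);
  // ... enqueue work on *stream ...
}  // PtrDeleter runs here: an ok() stream re-enters the pool; an errored
   // stream is deleted so the next borrow creates a fresh one.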
diff --git a/tensorflow/compiler/xla/service/stream_pool.h b/tensorflow/compiler/xla/service/stream_pool.h
new file mode 100644
index 0000000000..7221d323a6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stream_pool.h
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_STREAM_POOL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_STREAM_POOL_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// Pool of stream_executor::Streams, which are created as needed and
+// destroyed when the pool is destroyed.
+class StreamPool {
+ public:
+ struct PtrDeleter {
+ void operator()(se::Stream* stream) { pool->ReturnStream(stream); }
+ StreamPool* pool;
+ };
+
+ // Stream pointer type returned by BorrowStream, which returns the
+ // stream to the pool on destruction.
+ using Ptr = std::unique_ptr<se::Stream, PtrDeleter>;
+
+ StreamPool() {}
+
+ // Returns a pointer to a stream in the pool, creating a new stream
+ // if none are available in the pool. The returned smart pointer
+ // returns the stream to the pool on destruction.
+ //
+ // This method is thread-safe.
+ Ptr BorrowStream(se::StreamExecutor* executor);
+
+ private:
+ // Puts a pointer to a stream back into the pool, leaving it free
+ // for future use. Streams that have previously encountered errors
+ // are deleted, and not returned to the pool.
+ //
+ // This method is thread-safe.
+ void ReturnStream(se::Stream* stream);
+
+ tensorflow::mutex mu_;
+ std::vector<std::unique_ptr<se::Stream>> streams_ GUARDED_BY(mu_);
+};
+
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_STREAM_POOL_H_
diff --git a/tensorflow/compiler/xla/service/stream_pool_test.cc b/tensorflow/compiler/xla/service/stream_pool_test.cc
new file mode 100644
index 0000000000..aaf5c37b0d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stream_pool_test.cc
@@ -0,0 +1,136 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stream_pool.h"
+
+#include <memory>
+
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace {
+
+class StreamPoolTest : public ::testing::Test {
+ protected:
+ std::unique_ptr<se::StreamExecutor> NewStreamExecutor() {
+ se::Platform* platform =
+ se::MultiPlatformManager::PlatformWithName("Host").ConsumeValueOrDie();
+ se::StreamExecutorConfig config(/*ordinal=*/0);
+ return platform->GetUncachedExecutor(config).ConsumeValueOrDie();
+ }
+};
+
+TEST_F(StreamPoolTest, EmptyPool) { StreamPool pool; }
+
+TEST_F(StreamPoolTest, OneStreamPool) {
+ std::unique_ptr<se::StreamExecutor> executor = NewStreamExecutor();
+ StreamPool pool;
+
+ // Borrow and return a stream.
+ StreamPool::Ptr stream1 = pool.BorrowStream(executor.get());
+ se::Stream* stream1_ptr = stream1.get();
+ EXPECT_TRUE(stream1->ok());
+ stream1 = nullptr;
+
+ // Borrow and return another stream.
+ StreamPool::Ptr stream2 = pool.BorrowStream(executor.get());
+ se::Stream* stream2_ptr = stream2.get();
+ EXPECT_TRUE(stream2->ok());
+ stream2 = nullptr;
+
+ // The underlying streams should be the same, since stream1 was the
+ // only stream available in the pool when stream2 was borrowed.
+ EXPECT_EQ(stream1_ptr, stream2_ptr);
+}
+
+TEST_F(StreamPoolTest, TwoStreamPool) {
+ std::unique_ptr<se::StreamExecutor> executor = NewStreamExecutor();
+ StreamPool pool;
+
+ // Borrow two streams.
+ StreamPool::Ptr stream1 = pool.BorrowStream(executor.get());
+ se::Stream* stream1_ptr = stream1.get();
+ EXPECT_TRUE(stream1->ok());
+ StreamPool::Ptr stream2 = pool.BorrowStream(executor.get());
+ se::Stream* stream2_ptr = stream2.get();
+ EXPECT_TRUE(stream2->ok());
+
+ // The underlying streams should be different, since we haven't
+ // returned either of them yet.
+ EXPECT_NE(stream1_ptr, stream2_ptr);
+
+ // Return stream1 and borrow stream3.
+ stream1 = nullptr;
+ StreamPool::Ptr stream3 = pool.BorrowStream(executor.get());
+ se::Stream* stream3_ptr = stream3.get();
+ EXPECT_TRUE(stream3->ok());
+
+ // stream1 and stream3 should be the same.
+ EXPECT_EQ(stream1_ptr, stream3_ptr);
+ EXPECT_NE(stream2_ptr, stream3_ptr);
+
+ // Return stream2, and borrow stream4.
+ stream2 = nullptr;
+ StreamPool::Ptr stream4 = pool.BorrowStream(executor.get());
+ se::Stream* stream4_ptr = stream4.get();
+ EXPECT_TRUE(stream4->ok());
+
+  // stream2 and stream4 should be the same.
+ EXPECT_EQ(stream2_ptr, stream4_ptr);
+ EXPECT_NE(stream3_ptr, stream4_ptr);
+}
+
+TEST_F(StreamPoolTest, BadStreamDiscarded) {
+ std::unique_ptr<se::StreamExecutor> executor = NewStreamExecutor();
+ StreamPool pool;
+
+ // Borrow a stream.
+ StreamPool::Ptr stream1 = pool.BorrowStream(executor.get());
+ EXPECT_TRUE(stream1->ok());
+
+ // Force an error on the stream; here we call a method that requires
+ // DNN support, which we know the Host platform doesn't support.
+ stream1->ThenDepthConcatenate({}, {}, nullptr);
+ EXPECT_FALSE(stream1->ok());
+
+ // Return stream1 and borrow stream2.
+ stream1 = nullptr;
+ StreamPool::Ptr stream2 = pool.BorrowStream(executor.get());
+ se::Stream* stream2_ptr = stream2.get();
+ EXPECT_TRUE(stream2->ok());
+
+ // The underlying streams should be different. They would have been
+ // the same, but since we forced an error on stream1, it cannot be
+ // put back into the pool. Sadly we can't just check:
+ // EXPECT_NE(stream1_ptr, stream2_ptr);
+ //
+ // The above should hold logically, but it may fail if the new
+ // stream instance allocated for stream2 happens to reside in the
+ // same memory address as stream1, which has been deleted.
+ //
+  // The EXPECT_TRUE(stream2->ok()) above serves as a good-enough check.
+
+ // Return stream2 and borrow stream3. The previous error on stream1
+ // has no effect on these streams, and they are the same.
+ stream2 = nullptr;
+ StreamPool::Ptr stream3 = pool.BorrowStream(executor.get());
+ se::Stream* stream3_ptr = stream3.get();
+ EXPECT_TRUE(stream3->ok());
+ EXPECT_EQ(stream2_ptr, stream3_ptr);
+}
+
+} // namespace
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 7051a4cf51..58f767e913 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include <unordered_set>
#include <vector>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 6480148336..ec901af1e2 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -682,7 +682,7 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
CompatibleIgnoringElementType);
} else {
-      // Opaque, token, etc types are vacuously compatible.
-      return true;
+      // Opaque, token, etc types must have identical element types.
+      return lhs.element_type() == rhs.element_type();
}
}
@@ -697,7 +697,7 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
CompatibleIgnoringFpPrecision);
} else {
-      // Opaque, token, etc types are vacuously compatible.
-      return true;
+      // Opaque, token, etc types must have identical element types.
+      return lhs.element_type() == rhs.element_type();
}
}
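The behavioral change is easy to miss in the one-line diffs: the old `return true` declared any two non-array, non-tuple shapes compatible, so an opaque shape compared compatible with a token shape (or, as the new OpaqueVsArray test below exercises, with an array that fell through to this branch). A sketch of the fixed semantics, assuming the existing MakeOpaqueShape/MakeTokenShape factories:

// Sketch of the changed semantics (not part of the patch).
Shape opaque = ShapeUtil::MakeOpaqueShape();
Shape token = ShapeUtil::MakeTokenShape();
// Before: both checks fell through to the "vacuously compatible" branch and
// returned true. After: element types are compared, so these now disagree.
CHECK(!ShapeUtil::Compatible(opaque, token));
CHECK(ShapeUtil::Compatible(opaque, ShapeUtil::MakeOpaqueShape()));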
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index ed2d16c0e9..e5dd62ae9a 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -334,6 +334,17 @@ TEST(ShapeUtilTest, IncompatibleScalarVsTuple) {
EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape2, shape1));
}
+TEST(ShapeUtilTest, OpaqueVsArray) {
+ Shape shape1 = ShapeUtil::MakeShape(F32, {5, 7});
+ Shape shape2 = ShapeUtil::MakeOpaqueShape();
+ EXPECT_FALSE(ShapeUtil::Compatible(shape1, shape2));
+ EXPECT_FALSE(ShapeUtil::Compatible(shape2, shape1));
+ EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape1, shape2));
+ EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape2, shape1));
+ EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(shape1, shape2));
+ EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(shape2, shape1));
+}
+
TEST(ShapeUtilTest, CompareShapesWithPaddedDimensionsMismatch) {
Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30});
shape1.mutable_layout()->add_padded_dimensions(10);
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 6a75aa6794..42d52aee78 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -154,8 +154,8 @@ tf_cc_binary(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/client:client_library",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service/cpu:cpu_compiler",
"//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
"//tensorflow/core:lib",
@@ -192,8 +192,8 @@ cc_library(
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:interpreter_plugin", # reference backend
"//tensorflow/compiler/xla/service:platform_util",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -262,7 +262,7 @@ cc_library(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:computation_placer",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:local_service",
@@ -290,8 +290,8 @@ xla_test(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:lib",
@@ -314,8 +314,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:test",
@@ -334,8 +334,8 @@ xla_test(
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:test",
@@ -356,9 +356,9 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/service:platform_util",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -376,9 +376,10 @@ xla_test(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:platform_util",
+ "//tensorflow/compiler/xla/service:stream_pool",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:test_utils",
"//tensorflow/core:lib",
@@ -395,8 +396,8 @@ xla_test(
],
deps = [
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -419,9 +420,9 @@ xla_test(
"//tensorflow/compiler/xla:xla_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:test_utils",
@@ -445,8 +446,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -464,9 +465,9 @@ xla_test(
deps = [
"//tensorflow/compiler/xla:array2d",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:test",
@@ -483,8 +484,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -501,8 +502,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -519,9 +520,9 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -543,8 +544,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -562,8 +563,8 @@ xla_test(
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:lib",
@@ -586,8 +587,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:lib",
@@ -612,8 +613,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -638,7 +639,7 @@ xla_test(
deps = [
":client_library_test_base",
":literal_test_util",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:lib",
],
@@ -658,7 +659,7 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:reduce_precision_insertion",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -681,8 +682,8 @@ xla_test(
"//tensorflow/compiler/xla:reference_util",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:test_utils",
@@ -702,7 +703,7 @@ xla_test(
"//tensorflow/compiler/xla:execution_options_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:test",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:hlo_parser",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
],
@@ -725,8 +726,8 @@ xla_test(
"//tensorflow/compiler/xla:reference_util",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:test_utils",
@@ -749,8 +750,8 @@ xla_test(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -773,8 +774,8 @@ xla_test(
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -795,7 +796,7 @@ CONVOLUTION_TEST_DEPS = [
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:padding",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -838,8 +839,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:padding",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -862,8 +863,8 @@ xla_test(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:padding",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -891,10 +892,10 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
"//tensorflow/compiler/xla/client/lib:math",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -924,9 +925,9 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -950,8 +951,8 @@ xla_test(
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:test_helpers",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -972,7 +973,7 @@ xla_test(
"//tensorflow/compiler/xla:array2d",
"//tensorflow/compiler/xla:reference_util",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -991,8 +992,8 @@ xla_test(
"//tensorflow/compiler/xla:array2d",
"//tensorflow/compiler/xla:array3d",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1013,7 +1014,7 @@ xla_test(
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:computation_placer",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:local_service",
@@ -1044,8 +1045,8 @@ xla_test(
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1065,9 +1066,9 @@ xla_test(
"//tensorflow/compiler/xla:array3d",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1096,9 +1097,9 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1123,9 +1124,9 @@ xla_test_library(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:padding",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1164,9 +1165,9 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:padding",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1187,7 +1188,7 @@ xla_test(
"//tensorflow/compiler/xla:literal",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1241,8 +1242,8 @@ xla_test(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1260,7 +1261,7 @@ xla_test(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
"//tensorflow/compiler/xla/tests:client_library_test_base",
@@ -1283,8 +1284,8 @@ xla_test(
"//tensorflow/compiler/xla:array4d",
"//tensorflow/compiler/xla:reference_util",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1306,8 +1307,8 @@ xla_test(
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1327,8 +1328,8 @@ xla_test(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1345,8 +1346,8 @@ xla_test(
],
deps = [
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1362,8 +1363,8 @@ xla_test(
],
deps = [
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1387,8 +1388,8 @@ xla_test(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:test_utils",
@@ -1408,8 +1409,8 @@ xla_test(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:lib",
@@ -1438,8 +1439,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1458,7 +1459,7 @@ xla_test(
"//tensorflow/compiler/xla:array2d",
"//tensorflow/compiler/xla:array4d",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1481,9 +1482,9 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:arithmetic",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1507,8 +1508,8 @@ xla_test(
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1527,8 +1528,8 @@ xla_test(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1549,8 +1550,8 @@ xla_test(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:hlo_parser",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1572,7 +1573,7 @@ xla_test(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1593,8 +1594,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:test_utils",
@@ -1612,8 +1613,8 @@ xla_test(
],
deps = [
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1635,8 +1636,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:global_data",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:test_utils",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1656,8 +1657,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:test_utils",
@@ -1673,8 +1674,8 @@ xla_test(
deps = [
":client_library_test_base",
"//tensorflow/compiler/xla/client:global_data",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:test",
],
@@ -1687,8 +1688,8 @@ xla_test(
deps = [
":client_library_test_base",
"//tensorflow/compiler/xla/client:global_data",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:test",
],
@@ -1708,8 +1709,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:hlo_proto",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1793,8 +1794,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_parser",
"//tensorflow/compiler/xla/service:hlo_runner",
@@ -1821,8 +1822,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_runner",
"//tensorflow/compiler/xla/service:platform_util",
@@ -1858,8 +1859,8 @@ xla_test(
"//tensorflow/compiler/xla:literal",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:local_service",
"//tensorflow/compiler/xla/service:shaped_buffer",
"//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1886,8 +1887,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:local_service",
"//tensorflow/compiler/xla/service:platform_util",
@@ -1903,6 +1904,16 @@ xla_test(
],
)
+xla_test(
+ name = "outfeed_in_nested_computation_test",
+ srcs = ["outfeed_in_nested_computation_test.cc"],
+ deps = [
+ "//tensorflow/compiler/xla/tests:local_client_test_base",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+ "//tensorflow/core:test",
+ ],
+)
+
tf_cc_test(
name = "hlo_metadata_test",
srcs = [
@@ -1912,7 +1923,7 @@ tf_cc_test(
":local_client_test_base",
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:cpu_plugin",
"//tensorflow/compiler/xla/service:local_service",
"//tensorflow/core:test_main",
@@ -1954,8 +1965,8 @@ xla_test(
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:literal_test_util",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1968,7 +1979,7 @@ xla_test(
name = "deep_graph_test",
srcs = ["deep_graph_test.cc"],
deps = [
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/tests:client_library_test_base",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
],
@@ -2001,6 +2012,7 @@ xla_test(
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:generic_transfer_manager",
"//tensorflow/compiler/xla/service:shaped_buffer",
+ "//tensorflow/compiler/xla/service:stream_pool",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/core:test",
@@ -2053,10 +2065,30 @@ xla_test(
":local_client_test_base",
":test_utils",
"//tensorflow/compiler/xla:shape_util",
- "//tensorflow/compiler/xla/client/xla_client:xla_builder",
- "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:hlo_parser",
"//tensorflow/compiler/xla/tests:xla_internal_test_main",
"//tensorflow/core:test",
],
)
+
+xla_test(
+ name = "iota_test",
+ srcs = ["iota_test.cc"],
+ blacklisted_backends = [
+ "cpu",
+ "gpu",
+ ],
+ tags = [
+ "enable_for_xla_interpreter",
+ ],
+ deps = [
+ ":client_library_test_base",
+ ":literal_test_util",
+ ":xla_internal_test_main",
+ "//tensorflow/compiler/xla/client:xla_builder",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:test",
+ ],
+)
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 3ae96fa1bc..74f2e36f82 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
index 8d15b7841b..caeb0bf49a 100644
--- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
index 8c227df7f0..af0b852239 100644
--- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
+++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
#include <memory>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 6a024798f9..d372d1ca43 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -22,8 +22,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/lib/math.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index 747c82b502..6c20f654fe 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
index 20cb989751..0d7a3aa46a 100644
--- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc
+++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
index d531e8fa82..c6b5108fe9 100644
--- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index 50dd574624..1d28e85b16 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/call_test.cc b/tensorflow/compiler/xla/tests/call_test.cc
index 2086e38b91..b1d18210ea 100644
--- a/tensorflow/compiler/xla/tests/call_test.cc
+++ b/tensorflow/compiler/xla/tests/call_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
#include <memory>
#include <utility>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
index 0bc8facfe2..a4eb57fc7b 100644
--- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
+++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index ef784da457..59d917054b 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/execution_options_util.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
@@ -273,10 +273,16 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
const Shape* shape_with_layout) {
std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
arguments_passed_in.end());
+
+ // Transfer and use the elements of arguments_ if the AddParam() API was used.
+ std::vector<std::unique_ptr<GlobalData>> owning_arguments;
if (!arguments_.empty()) {
CHECK(arguments.empty());
for (const auto& argument : arguments_) {
- arguments.push_back(argument.get());
+ owning_arguments.push_back(
+ client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument))
+ .ValueOrDie());
+ arguments.push_back(owning_arguments.back().get());
}
}
@@ -331,10 +337,16 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
ErrorSpec error, const Shape* shape_with_layout) {
std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
arguments_passed_in.end());
+
+ // Transfer and use the elements of arguments_ if the AddParam() API was used.
+ std::vector<std::unique_ptr<GlobalData>> owning_arguments;
if (!arguments_.empty()) {
CHECK(arguments.empty());
for (const auto& argument : arguments_) {
- arguments.push_back(argument.get());
+ owning_arguments.push_back(
+ client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument))
+ .ValueOrDie());
+ arguments.push_back(owning_arguments.back().get());
}
}
@@ -454,6 +466,14 @@ ClientLibraryTestBase::ComputeValueAndReference(
// function.
std::vector<std::unique_ptr<GlobalData>> argument_data;
std::vector<std::unique_ptr<GlobalData>> ref_argument_data;
+
+ // Use `arguments_` if the AddParam() API was used. Otherwise, use
+ // plain `arguments`.
+ if (!arguments_.empty()) {
+ CHECK_EQ(arguments.size(), 0);
+ arguments = arguments_;
+ }
+
for (const auto& arg : arguments) {
TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg.Clone()));
TF_ASSIGN_OR_RETURN(auto ref_data, ref_client_->TransferToServer(arg));
@@ -552,10 +572,9 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols,
XlaOp ClientLibraryTestBase::AddParam(const Literal& argument,
XlaBuilder* builder) {
- XlaOp data_handle;
- arguments_.push_back(CreateParameterAndTransferLiteral(
- arguments_.size(), argument, "", builder, &data_handle));
- return data_handle;
+ arguments_.push_back(argument.Clone());
+ return Parameter(builder, /*parameter_number=*/arguments_.size() - 1,
+ MaybeConvertShapeToBfloat16(argument.shape()), "");
}
XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal,
@@ -575,22 +594,39 @@ ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number,
nullptr, builder, data_handle);
}
+Shape ClientLibraryTestBase::MaybeConvertShapeToBfloat16(const Shape& shape) {
+ if (!use_bfloat16_) {
+ return shape;
+ }
+ Shape new_shape = shape;
+ ShapeUtil::ForEachMutableSubshape(&new_shape,
+ [](Shape* subshape, const ShapeIndex&) {
+ if (subshape->element_type() == F32) {
+ subshape->set_element_type(BF16);
+ }
+ });
+ return new_shape;
+}
+
+Literal ClientLibraryTestBase::MaybeConvertLiteralToBfloat16(
+ const Literal& literal) {
+ if (use_bfloat16_) {
+ return std::move(*LiteralUtil::ConvertF32ToBF16(literal));
+ }
+ return literal.Clone();
+}
+
std::unique_ptr<GlobalData>
ClientLibraryTestBase::CreateParameterAndTransferLiteral(
int64 parameter_number, const Literal& literal, const string& name,
const DeviceHandle* device_handle, XlaBuilder* builder,
XlaOp* data_handle) {
- const Literal* param_literal = &literal;
- std::unique_ptr<Literal> converted_literal;
- if (use_bfloat16_) {
- converted_literal = LiteralUtil::ConvertF32ToBF16(literal);
- param_literal = converted_literal.get();
- }
+ Literal param_literal = MaybeConvertLiteralToBfloat16(literal);
std::unique_ptr<GlobalData> data =
- client_->TransferToServer(*param_literal, device_handle)
+ client_->TransferToServer(param_literal, device_handle)
.ConsumeValueOrDie();
*data_handle =
- Parameter(builder, parameter_number, param_literal->shape(), name);
+ Parameter(builder, parameter_number, param_literal.shape(), name);
return data;
}
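
The AddParam() rework above replaces eagerly transferred GlobalData handles with stored Literals that are uploaded, and optionally converted to bfloat16, only when the computation actually runs. Below is a minimal, library-free sketch of that deferred-transfer ownership pattern; Literal, GlobalData, Client, and TestBase are hypothetical stand-ins for the real XLA types, not the actual API.

#include <memory>
#include <string>
#include <vector>

// Hypothetical stand-ins for the XLA types; only the ownership flow matters.
struct Literal {
  std::string payload;
  Literal Clone() const { return {payload}; }
};
struct GlobalData {
  std::string payload;  // Server-side copy of the literal.
};

struct Client {
  std::unique_ptr<GlobalData> TransferToServer(const Literal& l) {
    return std::unique_ptr<GlobalData>(new GlobalData{l.payload});
  }
};

class TestBase {
 public:
  // AddParam() analogue: record the literal; nothing is transferred yet.
  int AddParam(const Literal& argument) {
    arguments_.push_back(argument.Clone());
    return static_cast<int>(arguments_.size()) - 1;  // Parameter number.
  }

  // ComputeAndCompare analogue: transfer the recorded literals now. This is
  // the point where the real code applies MaybeConvertLiteralToBfloat16().
  std::vector<GlobalData*> PrepareArguments(Client* client) {
    owning_arguments_.clear();
    std::vector<GlobalData*> raw;
    for (const Literal& argument : arguments_) {
      owning_arguments_.push_back(client->TransferToServer(argument));
      raw.push_back(owning_arguments_.back().get());
    }
    return raw;
  }

 private:
  std::vector<Literal> arguments_;                             // Build time.
  std::vector<std::unique_ptr<GlobalData>> owning_arguments_;  // Run time.
};

int main() {
  Client client;
  TestBase test;
  test.AddParam(Literal{"p0"});
  test.AddParam(Literal{"p1"});
  // The transfer is deferred until here; the handles stay alive for the run.
  std::vector<GlobalData*> args = test.PrepareArguments(&client);
  return args.size() == 2 ? 0 : 1;
}

Deferring the upload means one recorded argument list can serve both the f32 and the bf16 variant of a test, since the conversion happens at transfer time rather than at parameter-creation time.
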
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index fcc9347db5..4a6e8a3124 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -26,8 +26,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
@@ -399,12 +399,16 @@ class ClientLibraryTestBase : public ::testing::Test {
const string& error_message)>& verify_output,
const Shape* output_with_layout = nullptr);
+ // Converts an f32 shape/literal to bf16 if use_bfloat16_ is true.
+ Literal MaybeConvertLiteralToBfloat16(const Literal& literal);
+ Shape MaybeConvertShapeToBfloat16(const Shape& shape);
+
// Whether to run tests with all float-type input/output converted to
// bfloat16.
bool use_bfloat16_ = false;
// Arguments to be passed to the computation when it runs.
- std::vector<std::unique_ptr<GlobalData>> arguments_;
+ std::vector<Literal> arguments_;
};
template <typename NativeT>
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 6ce2f844a3..c898dacf48 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
index ff38246286..7c52c9fbbb 100644
--- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc
+++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 64bf8b3b38..5a06d061f0 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index 9f288634c0..be017477d8 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/array3d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/test.h"
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index 369663de15..b27c1044ba 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc
index 71d72a9828..4937574831 100644
--- a/tensorflow/compiler/xla/tests/constants_test.cc
+++ b/tensorflow/compiler/xla/tests/constants_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array3d.h"
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index dca57fd1c7..1adc68cc48 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include <array>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -52,13 +53,67 @@ TEST_F(ConvertTest, ConvertR1S32ToR1S32) {
ComputeAndCompareR1<int32>(&builder, expected, {});
}
+TEST_F(ConvertTest, ConvertR1S32ToR1U32) {
+ XlaBuilder builder(TestName());
+ auto a = ConstantR1<int32>(&builder, {42, 64});
+ ConvertElementType(a, U32);
+
+ std::vector<uint32> expected = {42, 64};
+ ComputeAndCompareR1<uint32>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1S32ToR1PRED) {
+ XlaBuilder builder(TestName());
+ auto a = ConstantR1<int32>(&builder, {42, 0, -64});
+ ConvertElementType(a, PRED);
+
+ std::array<bool, 3> expected = {true, false, true};
+ ComputeAndCompareR1<bool>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1U32ToR1U32) {
+ XlaBuilder builder(TestName());
+ auto a = ConstantR1<uint32>(&builder, {42, 64});
+ ConvertElementType(a, U32);
+
+ std::vector<uint32> expected = {42, 64};
+ ComputeAndCompareR1<uint32>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1U32ToR1S32) {
+ XlaBuilder builder(TestName());
+ auto a = ConstantR1<uint32>(&builder, {42, 64});
+ ConvertElementType(a, S32);
+
+ std::vector<int32> expected = {42, 64};
+ ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1U32ToR1PRED) {
+ XlaBuilder builder(TestName());
+ auto a = ConstantR1<uint32>(&builder, {42, 0, 64});
+ ConvertElementType(a, PRED);
+
+ std::array<bool, 3> expected = {true, false, true};
+ ComputeAndCompareR1<bool>(&builder, expected, {});
+}
+
TEST_F(ConvertTest, ConvertR1F32ToR1F32) {
XlaBuilder builder(TestName());
auto a = ConstantR1<float>(&builder, {42.0f, 64.0f});
ConvertElementType(a, F32);
std::vector<float> expected = {42.0f, 64.0f};
- ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+ ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1F32ToR1PRED) {
+ XlaBuilder builder(TestName());
+ auto a = ConstantR1<float>(&builder, {42.0f, 0.0f, 64.0f});
+ ConvertElementType(a, PRED);
+
+ std::array<bool, 3> expected = {true, false, true};
+ ComputeAndCompareR1<bool>(&builder, expected, {});
}
TEST_F(ConvertTest, ConvertR1S32ToR1F32) {
@@ -67,7 +122,7 @@ TEST_F(ConvertTest, ConvertR1S32ToR1F32) {
ConvertElementType(a, F32);
std::vector<float> expected = {42.0f, 64.0f};
- ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+ ComputeAndCompareR1<float>(&builder, expected, {});
}
TEST_F(ConvertTest, ConvertR1PREDToR1S32) {
@@ -79,6 +134,15 @@ TEST_F(ConvertTest, ConvertR1PREDToR1S32) {
ComputeAndCompareR1<int32>(&builder, expected, {});
}
+TEST_F(ConvertTest, ConvertR1PREDToR1U32) {
+ XlaBuilder builder(TestName());
+ auto a = ConstantR1<bool>(&builder, {true, false, true});
+ ConvertElementType(a, U32);
+
+ std::vector<uint32> expected = {1, 0, 1};
+ ComputeAndCompareR1<uint32>(&builder, expected, {});
+}
+
TEST_F(ConvertTest, ConvertR1PREDToR1F32) {
XlaBuilder builder(TestName());
auto a = ConstantR1<bool>(&builder, {true, false, true});
@@ -94,7 +158,7 @@ XLA_TEST_F(ConvertTest, ConvertR1S0S32ToR1S0F32) {
ConvertElementType(a, F32);
std::vector<float> expected = {};
- ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+ ComputeAndCompareR1<float>(&builder, expected, {});
}
TEST_F(ConvertTest, ConvertR1F32ToR1S32) {
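
The new PRED conversion tests above exercise the rule, asserted by the tests themselves, that any nonzero source value (including a negative one) converts to true, which matches C++'s own bool conversion. The dropped ErrorSpec arguments reflect that small integers such as 42 and 64 are exactly representable in f32, so an exact comparison suffices. A tiny XLA-free sketch of the elementwise PRED rule:

#include <cstdint>
#include <iostream>

int main() {
  const int32_t values[] = {42, 0, -64};
  for (int32_t v : values) {
    // The same rule ConvertElementType(a, PRED) applies to each element.
    std::cout << v << " -> " << (static_cast<bool>(v) ? "true" : "false")
              << "\n";
  }
  return 0;
}
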
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index 944366410b..7b6bbc4f57 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index a8b8f74ca9..5ed8122e00 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -23,7 +23,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/ptr_util.h"
diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
index 8792e7781b..6784c16715 100644
--- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
@@ -27,7 +27,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 1dc6ff0f4f..5ef273e5a2 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include <utility>
#include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 90f3d1b874..13c777835e 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include <memory>
#include <utility>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc
index d4b3aac85b..5f234f36a8 100644
--- a/tensorflow/compiler/xla/tests/deallocation_test.cc
+++ b/tensorflow/compiler/xla/tests/deallocation_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/compiler/xla/test_helpers.h"
diff --git a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
index a6a233e71a..2db6503afa 100644
--- a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/deep_graph_test.cc b/tensorflow/compiler/xla/tests/deep_graph_test.cc
index 810947ab01..3f3e8ab712 100644
--- a/tensorflow/compiler/xla/tests/deep_graph_test.cc
+++ b/tensorflow/compiler/xla/tests/deep_graph_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
namespace xla {
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index d86fd7cc2d..cfd36abf47 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/array3d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 88ac96d6b0..7f6f203a1b 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/local_service.h"
diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc
index ebba13c5b3..5116e60ca6 100644
--- a/tensorflow/compiler/xla/tests/execution_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
#include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
index 86bfaea4ef..bf1de02ba9 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/floor_ceil_test.cc b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
index 30dc639f11..39cc6c5927 100644
--- a/tensorflow/compiler/xla/tests/floor_ceil_test.cc
+++ b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include <string>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc
index 0254ae1baa..c5bbbe778d 100644
--- a/tensorflow/compiler/xla/tests/fmax_test.cc
+++ b/tensorflow/compiler/xla/tests/fmax_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index dc64477935..792be0d3fc 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
@@ -799,6 +799,46 @@ ENTRY main {
*result));
}
+class FusionClientLibraryTest : public ClientLibraryTestBase {};
+
+XLA_TEST_F(FusionClientLibraryTest, ManyLayoutTransformations) {
+ // On the GPU backend, it's possible to have too many transposes within one
+ // fusion, causing the kernel to run out of shared memory and thus not
+ // compile. We want to check that this doesn't happen.
+ //
+ // To do this, we create a computation that computes
+ //
+ // P0 + P0*P1*P1 + P0*P2*P2 ...
+ //
+ // where even parameters have layout 1 and odd parameters have layout 2.
+ //
+ // Our goal is to tempt the backend into creating one giant multi-output
+ // fusion for the whole computation, including the transposes. Currently
+ // multi-output fusion only fuses fusions, so each of the terms in the sum
+ // needs to be a fusion itself, thus the contortions above.
+ constexpr int kNumParams = 25;
+ XlaBuilder b("ManyLayoutTransformations");
+
+ // This test produces values that overflow int32, which is UB, so use uint32,
+ // where overflow is OK.
+ Array2D<uint32> arr(32, 32);
+ arr.FillUnique();
+ std::unique_ptr<Literal> l1 = LiteralUtil::CreateR2FromArray2D(arr)->Relayout(
+ LayoutUtil::MakeLayout({0, 1}));
+
+ std::unique_ptr<Literal> l2 = LiteralUtil::CreateR2FromArray2D(arr)->Relayout(
+ LayoutUtil::MakeLayout({1, 0}));
+
+ XlaOp p0 = AddParam(*l1, &b);
+ XlaOp sum = p0;
+ for (int i = 1; i < kNumParams; ++i) {
+ auto pN = AddParam((i % 2 == 0 ? *l1 : *l2), &b);
+ sum = sum + p0 * pN * pN;
+ }
+
+ ComputeAndCompare(&b, {});
+}
+
void BM_ParallelFusion(int num_iters) {
// Simple element-wise computation to benchmark parallel task partitioning.
tensorflow::testing::StopTiming();
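
In the ManyLayoutTransformations test above, an XLA layout lists dimensions from minor (fastest-varying) to major, so for the 32x32 arrays {1, 0} is row-major and {0, 1} is column-major; alternating the two between parameters is what forces the physical transposes the test wants to pile into one fusion. A small illustration of how the two layouts address the same element; LinearIndex is purely illustrative and not part of the XLA API:

#include <cstdint>
#include <iostream>
#include <vector>

// Linear offset of element (i0, i1) in a d0 x d1 array whose layout is given
// minor-to-major, as in LayoutUtil::MakeLayout.
int64_t LinearIndex(int64_t i0, int64_t i1, int64_t d0, int64_t d1,
                    const std::vector<int>& minor_to_major) {
  const int64_t index[2] = {i0, i1};
  const int64_t dims[2] = {d0, d1};
  int64_t offset = 0;
  int64_t stride = 1;
  for (int dim : minor_to_major) {  // Minor dimension first.
    offset += index[dim] * stride;
    stride *= dims[dim];
  }
  return offset;
}

int main() {
  // Element (2, 3) of a 32x32 array under the two layouts used in the test.
  std::cout << LinearIndex(2, 3, 32, 32, {1, 0}) << "\n";  // Row-major: 67.
  std::cout << LinearIndex(2, 3, 32, 32, {0, 1}) << "\n";  // Col-major: 98.
  return 0;
}
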
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index c5ca64fa3f..b77bece85a 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/execution_options_util.h"
#include "tensorflow/compiler/xla/service/hlo_parser.h"
#include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/xla/tests/half_test.cc b/tensorflow/compiler/xla/tests/half_test.cc
index 73a47eda72..51450314b6 100644
--- a/tensorflow/compiler/xla/tests/half_test.cc
+++ b/tensorflow/compiler/xla/tests/half_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include <cmath>
#include <vector>
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/test.h"
@@ -48,7 +48,8 @@ class UnaryOpTest : public HalfTestBase,
public ::testing::WithParamInterface<UnaryOpTestParam> {};
XLA_TEST_P(UnaryOpTest, Ops) {
- std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1)});
+ std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1), half(9.0),
+ half(42.0), half(-9.0), half(-100.0)});
XlaBuilder builder(TestName());
XlaOp x_opnd;
auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
diff --git a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
index 4d82442f7e..5511190caf 100644
--- a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
+++ b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/service/local_service.h"
#include "tensorflow/compiler/xla/test_helpers.h"
#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/iota_test.cc b/tensorflow/compiler/xla/tests/iota_test.cc
new file mode 100644
index 0000000000..f950aa1e8f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/iota_test.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace {
+
+class IotaTest : public ClientLibraryTestBase {
+ public:
+ explicit IotaTest(se::Platform* platform = nullptr)
+ : ClientLibraryTestBase(platform) {}
+ template <typename T>
+ std::vector<T> GetExpected(const int64 num_elements) {
+ std::vector<T> result(num_elements);
+ std::iota(result.begin(), result.end(), 0);
+ return result;
+ }
+};
+
+TEST_F(IotaTest, SimpleR1) {
+ for (int num_elements = 1; num_elements < 10000001; num_elements *= 10) {
+ {
+ XlaBuilder builder(TestName() + "_f32");
+ IotaGen(&builder, F32, num_elements);
+ ComputeAndCompareR1<float>(&builder, GetExpected<float>(num_elements), {},
+ ErrorSpec{0.0001});
+ }
+ {
+ XlaBuilder builder(TestName() + "_u32");
+ IotaGen(&builder, U32, num_elements);
+ ComputeAndCompareR1<uint32>(&builder, GetExpected<uint32>(num_elements),
+ {});
+ }
+ {
+ XlaBuilder builder(TestName() + "_s32");
+ IotaGen(&builder, S32, num_elements);
+ ComputeAndCompareR1<int32>(&builder, GetExpected<int32>(num_elements),
+ {});
+ }
+ }
+}
+
+} // namespace
+} // namespace xla
diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
index 0df50150ae..e2cd5bcc5a 100644
--- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include <memory>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/local_service.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
index 70612e7c49..74494e60e8 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@@ -21,8 +21,8 @@ limitations under the License.
#include "llvm/ADT/Triple.h"
#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
#include "tensorflow/compiler/xla/types.h"
@@ -92,9 +92,10 @@ int main(int argc, char** argv) {
// It's lame to hard-code the buffer assignments, but we need
// local_client_aot_test.cc to be able to easily invoke the function.
CHECK_EQ(result->result_buffer_index(), 1);
- CHECK_EQ(result->buffer_sizes().size(), 2);
+ CHECK_EQ(result->buffer_sizes().size(), 3);
CHECK_EQ(result->buffer_sizes()[0], -1); // param buffer
CHECK_EQ(result->buffer_sizes()[1], sizeof(float)); // result buffer
+ CHECK_EQ(result->buffer_sizes()[2], -1); // const buffer
if (triple.isOSBinFormatELF()) {
// Check the ELF magic.
CHECK_EQ(result->object_file_data()[0], 0x7F);
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 2f4d197ae6..1a823cf189 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -772,6 +772,10 @@ XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
ScopedShapedBuffer result =
executable->Run({&x_array}, DefaultExecutableRunOptions())
.ConsumeValueOrDie();
+ ASSERT_IS_OK(local_client_->mutable_backend()
+ ->BorrowStream(0)
+ .ValueOrDie()
+ ->BlockHostUntilDone());
LiteralTestUtil::ExpectR1Near<float>(
{2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 88797a7d0a..eaddf756db 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -20,6 +20,7 @@ limitations under the License.
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
@@ -189,7 +190,19 @@ StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
TF_ASSIGN_OR_RETURN(
std::unique_ptr<LocalExecutable> executable,
local_client_->Compile(computation, argument_layouts, build_options));
- return executable->Run(arguments, run_options);
+ TF_ASSIGN_OR_RETURN(auto ret, executable->Run(arguments, run_options));
+
+ auto device_ordinal =
+ build_options.device_ordinal() == -1 ? 0 : build_options.device_ordinal();
+ auto* stream = run_options.stream();
+ if (!stream) {
+ stream = local_client_->mutable_backend()
+ ->BorrowStream(device_ordinal)
+ .ValueOrDie()
+ .get();
+ }
+ TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
+ return std::move(ret);
}
} // namespace xla
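
The two hunks above add an explicit host synchronization after LocalExecutable::Run(), which may return before the device work has finished; without BlockHostUntilDone() the test could read result buffers that are still being written. A minimal, library-free sketch of the same launch-then-block pattern using std::async as a stand-in for the device stream:

#include <future>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> result(3);
  // Analogue of executable->Run(...): kick off the work asynchronously.
  std::future<void> pending = std::async(std::launch::async, [&result] {
    for (size_t i = 0; i < result.size(); ++i) result[i] = 2.0f * (i + 1);
  });
  // Analogue of stream->BlockHostUntilDone(): synchronize before reading.
  pending.wait();
  for (float v : result) std::cout << v << " ";  // Prints "2 4 6".
  std::cout << "\n";
  return 0;
}
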
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 258226523d..b4477e9a6b 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/local_service.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
diff --git a/tensorflow/compiler/xla/tests/log_test.cc b/tensorflow/compiler/xla/tests/log_test.cc
index cdf70ee418..2d622242e6 100644
--- a/tensorflow/compiler/xla/tests/log_test.cc
+++ b/tensorflow/compiler/xla/tests/log_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 7ddc636931..0732e195d4 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index 069b8a881f..da8c42d465 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/reference_util.h"
diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
index e576f000ef..955dbef6dc 100644
--- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
+++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/array3d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc b/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc
new file mode 100644
index 0000000000..cea7006526
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc
@@ -0,0 +1,168 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+// Tests ensuring that outfeed instructions contained in nested computations
+// at non-root positions are executed.
+
+class LocalClientExecuteTest : public LocalClientTestBase {};
+
+TEST_F(LocalClientExecuteTest, OutfeedInWhile) {
+ XlaBuilder b(TestName());
+
+ Shape state_tuple_array_shape = ShapeUtil::MakeShape(xla::S32, {10, 5});
+ Shape int_shape = ShapeUtil::MakeShape(xla::S32, {});
+ Shape state_tuple_shape =
+ ShapeUtil::MakeTupleShape({int_shape, state_tuple_array_shape});
+ Shape xfeed_shape = ShapeUtil::MakeShape(xla::S32, {2});
+
+ XlaOp some_buffer = Broadcast(ConstantR0<int32_t>(&b, 0), {10, 5});
+ XlaOp num_iter = Infeed(&b, int_shape);
+ XlaOp init_tuple = Tuple(&b, {num_iter, some_buffer});
+
+ TF_ASSERT_OK_AND_ASSIGN(XlaComputation loop_cond, [&] {
+ // Condition: iteration variable > 0
+ XlaBuilder cond_builder("loop_condition");
+ XlaOp state_tuple = Parameter(&cond_builder, 0, state_tuple_shape, "state");
+ XlaOp loop_counter = GetTupleElement(state_tuple, 0);
+ Outfeed(loop_counter, int_shape, "");
+ Gt(loop_counter, ConstantR0<int32_t>(&cond_builder, 0));
+ return cond_builder.Build();
+ }());
+
+ TF_ASSERT_OK_AND_ASSIGN(XlaComputation loop_body, [&] {
+ XlaBuilder body_builder("loop_body");
+ XlaOp state_tuple = Parameter(&body_builder, 0, state_tuple_shape, "state");
+ XlaOp loop_counter = GetTupleElement(state_tuple, 0);
+ XlaOp buffer_inside = GetTupleElement(state_tuple, 1);
+
+ // Read some data from Infeed.
+ XlaOp some_input = Infeed(&body_builder, xfeed_shape);
+ XlaOp sum = Add(some_input, Broadcast(loop_counter, {2}));
+ Outfeed(sum, xfeed_shape, "");
+
+ XlaOp iter_left = Sub(loop_counter, ConstantR0<int32_t>(&body_builder, 1));
+
+ Tuple(&body_builder, {iter_left, buffer_inside});
+ return body_builder.Build();
+ }());
+
+ // Build loop.
+ XlaOp result_tuple = While(loop_cond, loop_body, init_tuple);
+ GetTupleElement(result_tuple, 0);
+ TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, b.Build());
+
+ std::unique_ptr<xla::Literal> comp_result;
+ std::unique_ptr<tensorflow::Thread> thread(
+ tensorflow::Env::Default()->StartThread(
+ tensorflow::ThreadOptions(), "execute_thread", [&] {
+ comp_result = local_client_->ExecuteAndTransfer(computation, {})
+ .ConsumeValueOrDie();
+ }));
+
+ VLOG(1) << "Transferring trip count to computation";
+ // Transfer number of iterations to Infeed.
+ TF_ASSERT_OK(
+ local_client_->TransferToInfeed(*LiteralUtil::CreateR0<int32_t>(1)));
+
+ // Pick up the value from the outfeed.
+ {
+ VLOG(1) << "Reading from condition outfeed";
+ TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> r,
+ local_client_->TransferFromOutfeed(&int_shape));
+ EXPECT_EQ(r->Get<int32>({}), 1);
+ }
+
+ VLOG(1) << "Writing data to infeed";
+ // Transfer some data to Infeed for use inside the loop.
+ TF_ASSERT_OK(local_client_->TransferToInfeed(
+ *LiteralUtil::CreateR1<int32_t>({10, 20})));
+
+ // Pick up the value from the outfeed.
+ {
+ VLOG(1) << "Reading from body outfeed";
+ TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> r,
+ local_client_->TransferFromOutfeed(&xfeed_shape));
+ EXPECT_EQ(r->Get<int32>({0}), 11);
+ EXPECT_EQ(r->Get<int32>({1}), 21);
+ }
+
+ {
+ VLOG(1) << "Reading from condition outfeed";
+ TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> r,
+ local_client_->TransferFromOutfeed(&int_shape));
+ EXPECT_EQ(r->Get<int32>({}), 0);
+ }
+
+ // Join the thread.
+ thread.reset();
+
+ EXPECT_EQ(comp_result->Get<int32>({}), 0);
+}
+
+TEST_F(LocalClientExecuteTest, OutfeedInConditional) {
+ XlaBuilder b(TestName());
+
+ Shape condition_shape = ShapeUtil::MakeShape(xla::PRED, {});
+ Shape result_shape = ShapeUtil::MakeShape(xla::PRED, {});
+
+ TF_ASSERT_OK_AND_ASSIGN(XlaComputation true_computation, [&] {
+ XlaBuilder inner_builder("true_computation");
+ XlaOp param = Parameter(&inner_builder, 0, result_shape, "param");
+ Outfeed(param, result_shape, "");
+ Or(param, param);
+ return inner_builder.Build();
+ }());
+
+ TF_ASSERT_OK_AND_ASSIGN(XlaComputation false_computation, [&] {
+ XlaBuilder inner_builder("false_computation");
+ Parameter(&inner_builder, 0, result_shape, "param");
+ return inner_builder.Build();
+ }());
+
+ XlaOp pred = Infeed(&b, condition_shape);
+ Conditional(/*predicate=*/pred, /*true_operand=*/pred,
+ /*true_computation=*/true_computation, /*false_operand=*/pred,
+ /*false_computation=*/false_computation);
+
+ TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, b.Build());
+
+ std::unique_ptr<xla::Literal> comp_result;
+ std::unique_ptr<tensorflow::Thread> thread(
+ tensorflow::Env::Default()->StartThread(
+ tensorflow::ThreadOptions(), "execute_thread", [&] {
+ comp_result = local_client_->ExecuteAndTransfer(computation, {})
+ .ConsumeValueOrDie();
+ }));
+
+ TF_ASSERT_OK(
+ local_client_->TransferToInfeed(*LiteralUtil::CreateR0<bool>(true)));
+
+ TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> r,
+ local_client_->TransferFromOutfeed(&result_shape));
+
+ EXPECT_EQ(r->Get<bool>({}), true);
+
+ // Join the thread.
+ thread.reset();
+}
+
+} // namespace
+} // namespace xla
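
Both tests above run ExecuteAndTransfer() on a separate thread because the computation blocks waiting on infeed data while the main thread in turn blocks feeding it and draining the outfeed; doing both on one thread would deadlock. A library-free sketch of the same rendezvous, where the one-slot Mailbox is a hypothetical analogue of an Xfeed queue rather than anything in XLA:

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <optional>
#include <thread>

template <typename T>
class Mailbox {  // One-slot blocking channel.
 public:
  void Put(T value) {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return !slot_.has_value(); });
    slot_ = std::move(value);
    cv_.notify_all();
  }
  T Take() {
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return slot_.has_value(); });
    T value = std::move(*slot_);
    slot_.reset();
    cv_.notify_all();
    return value;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::optional<T> slot_;
};

int main() {
  Mailbox<int> infeed, outfeed;
  // "Device" thread: reads from infeed, writes to outfeed (the computation).
  std::thread device([&] { outfeed.Put(infeed.Take() + 1); });
  infeed.Put(10);                       // TransferToInfeed analogue.
  std::cout << outfeed.Take() << "\n";  // TransferFromOutfeed analogue: 11.
  device.join();
  return 0;
}
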
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index e428fa9b5e..ca21b0b2ba 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index 8ba1d11b33..f6c762e7a4 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -21,8 +21,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc
index 5c351b2d11..2fc7f816b5 100644
--- a/tensorflow/compiler/xla/tests/pred_test.cc
+++ b/tensorflow/compiler/xla/tests/pred_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 5ebf8344d2..029af69573 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include <memory>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/primitive_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
@@ -177,7 +177,7 @@ XLA_TEST_F(PrngTest, Uniformity108) {
EXPECT_LT(UniformChiSquared(108, 256), 132.144);
}
XLA_TEST_F(PrngTest, Uniformity256) {
- EXPECT_LT(UniformChiSquared(256, 256), 293.248);
+ EXPECT_LT(UniformChiSquared(256, 512), 293.248);
}
XLA_TEST_F(PrngTest, MapUsingRng) {
diff --git a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
index 526a38e8d1..fab2a65de1 100644
--- a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
+++ b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include <memory>
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/test_helpers.h"
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index 04c7f31646..531648fe3e 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index e4a8ddf86a..2065271a7f 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -37,8 +37,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/reference_util.h"
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index c2681f70f7..1bd6fdab31 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -24,8 +24,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc
index d544968648..d891451381 100644
--- a/tensorflow/compiler/xla/tests/replay_test.cc
+++ b/tensorflow/compiler/xla/tests/replay_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/protobuf_util.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
index 7c0389cfa3..368f5583c9 100644
--- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/reference_util.h"
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index 46d91711a5..382d1b1ae7 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -22,8 +22,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/reference_util.h"
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 23f0d26d93..41e49b4003 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/array4d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index 3b603c0d31..e42c71eb28 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index b1f1e69d3c..e3d4f98dd7 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -22,8 +22,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/reference_util.h"
diff --git a/tensorflow/compiler/xla/tests/select_test.cc b/tensorflow/compiler/xla/tests/select_test.cc
index 59409ab26e..1c01402798 100644
--- a/tensorflow/compiler/xla/tests/select_test.cc
+++ b/tensorflow/compiler/xla/tests/select_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index a593faca00..b8ad6668f8 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index 8f424ae81f..a2f0338e25 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/tests/test_utils.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/service/hlo_parser.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index 0f86b7f20f..125513ddfd 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -60,7 +61,7 @@ class TransferManagerTest : public LocalClientTestBase {
}
protected:
- Backend::StreamPtr stream_ptr_;
+ StreamPool::Ptr stream_ptr_;
se::Stream* stream_;
private:
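Backend::StreamPtr becomes StreamPool::Ptr throughout this patch: a handle that checks its stream back into a pool when it goes out of scope. A rough Python analogy of that ownership model (hypothetical names, not a TensorFlow API):

import contextlib

class StreamPool:
    """Hands out idle streams; borrowed streams come back automatically."""

    def __init__(self, streams):
        self._idle = list(streams)

    @contextlib.contextmanager
    def borrow_stream(self):
        stream = self._idle.pop()  # take an idle stream from the pool
        try:
            yield stream
        finally:
            # Mirrors StreamPool::Ptr's destructor returning the stream.
            self._idle.append(stream)

pool = StreamPool(['stream:0', 'stream:1'])
with pool.borrow_stream() as s:
    print('running on', s)
# On exit the stream is idle again and can be borrowed by the next user.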
diff --git a/tensorflow/compiler/xla/tests/transpose_test.cc b/tensorflow/compiler/xla/tests/transpose_test.cc
index 6ebb4324f8..fbe9d1b64a 100644
--- a/tensorflow/compiler/xla/tests/transpose_test.cc
+++ b/tensorflow/compiler/xla/tests/transpose_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index bf86c5dfb6..2fd70b72b5 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
@@ -29,6 +29,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
namespace xla {
@@ -545,5 +546,51 @@ XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
*result));
}
+// Disabled on interpreter due to lack of outfeed.
+XLA_TEST_F(TupleHloTest,
+ DISABLED_ON_INTERPRETER(NonAmbiguousTopLevelAllocation)) {
+ const char* testcase = R"(
+ HloModule tuple
+
+ ENTRY main {
+ a = f32[2] parameter(0)
+ b = f32[2] parameter(1)
+ c = f32[2] parameter(2)
+ d = f32[2] parameter(3)
+ cond = pred[] parameter(4)
+
+ tup0 = (f32[2],f32[2]) tuple(a, b)
+ tup1 = (f32[2],f32[2]) tuple(c, d)
+
+ s = (f32[2],f32[2]) tuple-select(cond, tup0, tup1)
+ gte = f32[2] get-tuple-element(s), index=0
+ tuple = (f32[2]) tuple(gte)
+ token = token[] after-all()
+ ROOT outfeed = token[] outfeed(tuple, token)
+ }
+ )";
+ auto module =
+ HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+ .ValueOrDie();
+ auto param0 = LiteralUtil::CreateR1<float>({1, 2});
+ auto param1 = LiteralUtil::CreateR1<float>({2, 3});
+ auto param4 = LiteralUtil::CreateR0<bool>(false);
+ // Put execution on a separate thread so we can block on outfeed.
+ std::unique_ptr<tensorflow::Thread> thread(
+ tensorflow::Env::Default()->StartThread(
+ tensorflow::ThreadOptions(), "execute_thread", [&] {
+ TF_EXPECT_OK(Execute(std::move(module),
+ {param0.get(), param1.get(), param1.get(),
+ param0.get(), param4.get()})
+ .status());
+ }));
+ auto expected =
+ LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1<float>({2, 3}));
+ auto literal = MakeUnique<Literal>();
+ TF_EXPECT_OK(backend().transfer_manager()->TransferLiteralFromOutfeed(
+ backend().default_stream_executor(), expected->shape(), literal.get()));
+ EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *literal));
+}
+
} // namespace
} // namespace xla
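The added test drives Execute on a helper thread because, as its comment notes, the main thread blocks in TransferLiteralFromOutfeed until the program reaches its outfeed instruction. The producer/consumer shape of that, reduced to standard-library Python (a loose analogy only, not the XLA API):

import queue
import threading

outfeed = queue.Queue()

def execute():
    # Stands in for Execute(): runs the program, which eventually outfeeds.
    outfeed.put([2.0, 3.0])

thread = threading.Thread(target=execute, name='execute_thread')
thread.start()
# Stands in for TransferLiteralFromOutfeed(): blocks until data arrives.
literal = outfeed.get()
thread.join()
assert literal == [2.0, 3.0]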
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index a90a6fb0a5..20ae68ab74 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
index ea3aba6df1..ef1b1445bb 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/array3d.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/literal_test_util.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index 79bae22dac..3848ec1684 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -21,8 +21,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/test_helpers.h"
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 29befef92e..c81c27891c 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -20,8 +20,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 4d4dd62a3f..0ee8e68c88 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -18,10 +18,11 @@ limitations under the License.
#include "tensorflow/compiler/xla/array2d.h"
#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
#include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -133,7 +134,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
DeviceMemoryAllocator* allocator = backend->memory_allocator();
auto* transfer_manager = backend->transfer_manager();
TF_ASSERT_OK_AND_ASSIGN(
- Backend::StreamPtr stream_ptr,
+ StreamPool::Ptr stream_ptr,
backend->BorrowStream(backend->default_device_ordinal()));
TF_ASSERT_OK_AND_ASSIGN(
@@ -172,6 +173,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
auto execution_result,
executable->ExecuteOnStream(&run_options, {&lhs_arg, &rhs_arg},
&hlo_execution_profile));
+ TF_ASSERT_OK(stream_ptr->BlockHostUntilDone());
(void)execution_result;
*profile_output =
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 55501827f2..d7cabbe876 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -37,6 +37,7 @@ cc_library(
"//tensorflow/compiler/xla/client",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
"//tensorflow/compiler/xla/service",
"//tensorflow/compiler/xla/service:hlo_proto",
@@ -84,6 +85,7 @@ cc_library(
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:global_data",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/client/lib:testing",
"//tensorflow/compiler/xla/service:hlo_parser",
"//tensorflow/compiler/xla/service:hlo_proto",
@@ -164,6 +166,7 @@ tf_cc_binary(
"//tensorflow/compiler/xla/client",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service",
"//tensorflow/compiler/xla/service:hlo_proto",
"//tensorflow/compiler/xla/service:interpreter_plugin",
@@ -181,6 +184,7 @@ tf_cc_binary(
"//tensorflow/compiler/xla/client",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_proto",
@@ -198,6 +202,7 @@ tf_cc_binary(
"//tensorflow/compiler/xla/client",
"//tensorflow/compiler/xla/client:client_library",
"//tensorflow/compiler/xla/client:local_client",
+ "//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
"//tensorflow/compiler/xla/service",
"//tensorflow/compiler/xla/service:hlo_graph_dumper",
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
index befb554537..f20dcef382 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
@@ -31,6 +31,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/service.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index cfb8f37487..f0af0580c1 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -22,6 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/service.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 5dd5150be3..f03e1b1f96 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/service.h"
#include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
index a5dce20456..dc5c106d02 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
@@ -29,6 +29,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/service.h"
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 854e797ec2..3bb2f3c000 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -42,6 +42,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/global_data.h"
#include "tensorflow/compiler/xla/client/lib/testing.h"
#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/execution_options_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index a173c51879..6a4e252b44 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -130,8 +130,12 @@ py_library(
"//tensorflow/contrib/bigtable", # depends on bigtable
"//tensorflow/contrib/cloud:cloud_py", # doesn't compile on Windows
"//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
- "//tensorflow/contrib/lite/python:lite", # unix dependency, need to fix code
+ # TODO(aaroey): tensorrt dependency has to appear before tflite so the
+ # build can resolve its flatbuffers symbols within the tensorrt library.
+ # This is an issue with the tensorrt static library and will be fixed by
+ # the next tensorrt release, so revisit the ordering here once that lands.
"//tensorflow/contrib/tensorrt:init_py", # doesn't compile on windows
+ "//tensorflow/contrib/lite/python:lite", # unix dependency, need to fix code
]),
)
diff --git a/tensorflow/contrib/android/cmake/src/main/AndroidManifest.xml b/tensorflow/contrib/android/cmake/src/main/AndroidManifest.xml
index bced47e046..c17110a78b 100644
--- a/tensorflow/contrib/android/cmake/src/main/AndroidManifest.xml
+++ b/tensorflow/contrib/android/cmake/src/main/AndroidManifest.xml
@@ -1,6 +1,10 @@
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
package="org.tensorflow.contrib.android">
+ <uses-sdk
+ android:minSdkVersion="4"
+ android:targetSdkVersion="19" />
+
<application android:allowBackup="true" android:label="@string/app_name"
android:supportsRtl="true">
diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py
index 7821c98f1c..26e7a4a4d3 100644
--- a/tensorflow/contrib/autograph/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -22,20 +22,21 @@ from __future__ import division
from __future__ import print_function
# TODO(mdan): Bring only the relevant symbols to the top level.
-from tensorflow.contrib.autograph import utils
from tensorflow.contrib.autograph import operators
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.core.errors import GraphConstructionError
+from tensorflow.contrib.autograph.core.errors import TfRuntimeError
+from tensorflow.contrib.autograph.core.errors import improved_errors
+from tensorflow.contrib.autograph.impl.api import RunMode
from tensorflow.contrib.autograph.impl.api import convert
from tensorflow.contrib.autograph.impl.api import converted_call
from tensorflow.contrib.autograph.impl.api import do_not_convert
-from tensorflow.contrib.autograph.impl.api import RunMode
from tensorflow.contrib.autograph.impl.api import to_code
-from tensorflow.contrib.autograph.core.errors import improved_errors
-from tensorflow.contrib.autograph.core.errors import GraphConstructionError
-from tensorflow.contrib.autograph.core.errors import TfRuntimeError
from tensorflow.contrib.autograph.impl.api import to_graph
from tensorflow.contrib.autograph.lang.directives import set_element_type
from tensorflow.contrib.autograph.lang.directives import set_loop_options
from tensorflow.contrib.autograph.lang.special_functions import stack
+from tensorflow.contrib.autograph.lang.special_functions import tensor_list
from tensorflow.contrib.autograph.pyct.transformer import AutographParseError
from tensorflow.python.util.all_util import remove_undocumented
@@ -57,6 +58,7 @@ _allowed_symbols = [
'set_element_type',
'set_loop_options',
'stack',
+ 'tensor_list',
# Exceptions
'AutographParseError',
# Utilities: to be removed
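tensor_list, newly exported here, builds a TensorFlow list (rather than a Python list) so that append and pop can be lowered to list ops by the converter, as the lists_test changes later in this patch exercise. A sketch of the intended usage against the contrib API of this era (TF 1.x graph mode):

import tensorflow as tf
from tensorflow.contrib import autograph as ag
from tensorflow.python.ops import list_ops

def append_three():
    l = ag.tensor_list([1, 2])  # a TensorList, not a Python list
    l.append(3)                 # rewritten to list ops during conversion
    return l

with tf.Graph().as_default():
    tl = ag.to_graph(append_three)()
    # Stack the list back into a dense tensor to inspect it.
    t = list_ops.tensor_list_stack(tl, tf.int32)
    with tf.Session() as sess:
        print(sess.run(t))  # [1 2 3]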
diff --git a/tensorflow/contrib/autograph/converters/asserts_test.py b/tensorflow/contrib/autograph/converters/asserts_test.py
index 9c58ae3acc..38faba45df 100644
--- a/tensorflow/contrib/autograph/converters/asserts_test.py
+++ b/tensorflow/contrib/autograph/converters/asserts_test.py
@@ -35,7 +35,7 @@ class AssertsTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {})
node = asserts.transform(node, ctx)
- self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call))
+ self.assertTrue(isinstance(node.body[0].value, gast.Call))
if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/converters/directives_test.py b/tensorflow/contrib/autograph/converters/directives_test.py
index 5f798a5b76..a573ba5850 100644
--- a/tensorflow/contrib/autograph/converters/directives_test.py
+++ b/tensorflow/contrib/autograph/converters/directives_test.py
@@ -38,7 +38,7 @@ class DirectivesTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {'directives': directives})
node = directives_converter.transform(node, ctx)
- def_, = anno.getanno(node.body[0].body[0].targets[0],
+ def_, = anno.getanno(node.body[0].targets[0],
anno.Static.DEFINITIONS)
d = def_.directives[directives.set_element_type]
self.assertEqual(d['dtype'].s, 'a')
@@ -52,7 +52,7 @@ class DirectivesTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {'directives': directives})
node = directives_converter.transform(node, ctx)
- def_, = anno.getanno(node.body[0].args.args[0], anno.Static.DEFINITIONS)
+ def_, = anno.getanno(node.args.args[0], anno.Static.DEFINITIONS)
d = def_.directives[directives.set_element_type]
self.assertEqual(d['dtype'].n, 1)
self.assertEqual(d['shape'].n, 2)
@@ -67,7 +67,7 @@ class DirectivesTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {'directives': directives})
node = directives_converter.transform(node, ctx)
- d = anno.getanno(node.body[0].body[1], AgAnno.DIRECTIVES)
+ d = anno.getanno(node.body[1], AgAnno.DIRECTIVES)
d = d[directives.set_loop_options]
self.assertEqual(d['parallel_iterations'].n, 10)
self.assertEqual(d['back_prop'].id, 'a')
diff --git a/tensorflow/contrib/autograph/converters/error_handlers_test.py b/tensorflow/contrib/autograph/converters/error_handlers_test.py
index 878526c8b4..cd74e5f18f 100644
--- a/tensorflow/contrib/autograph/converters/error_handlers_test.py
+++ b/tensorflow/contrib/autograph/converters/error_handlers_test.py
@@ -34,11 +34,13 @@ class ErrorHandlersTest(converter_testing.TestCase):
raise ValueError()
node, ctx = self.prepare(test_fn, {})
- anno.setanno(node.body[0], anno.Basic.ORIGIN,
- origin_info.OriginInfo('test_path', None, None, None, None))
+ anno.setanno(node, anno.Basic.ORIGIN,
+ origin_info.OriginInfo(None, None, None))
node = error_handlers.transform(node, ctx)
with self.compiled(node, {}) as result:
with self.assertRaises(errors.GraphConstructionError):
+ # Here we just assert that the handler works. Its correctness is
+ # verified by errors_test.py.
result.test_fn()
def test_no_origin_annotation(self):
diff --git a/tensorflow/contrib/autograph/converters/lists_test.py b/tensorflow/contrib/autograph/converters/lists_test.py
index 447a88bbe2..996e99ee61 100644
--- a/tensorflow/contrib/autograph/converters/lists_test.py
+++ b/tensorflow/contrib/autograph/converters/lists_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
from tensorflow.contrib.autograph.converters import lists
from tensorflow.contrib.autograph.core import converter_testing
from tensorflow.contrib.autograph.lang import directives
+from tensorflow.contrib.autograph.lang import special_functions
from tensorflow.contrib.autograph.pyct import anno
from tensorflow.contrib.autograph.pyct import parser
from tensorflow.python.framework import dtypes
@@ -52,20 +53,18 @@ class ListTest(converter_testing.TestCase):
return [1, 2, 3]
with self.converted(test_fn, lists, {}) as result:
- with self.test_session() as sess:
- tl = result.test_fn()
- r = list_ops.tensor_list_stack(tl, dtypes.int32)
- self.assertAllEqual(sess.run(r), [1, 2, 3])
+ self.assertAllEqual(result.test_fn(), [1, 2, 3])
def test_list_append(self):
def test_fn():
- l = [1]
+ l = special_functions.tensor_list([1])
l.append(2)
l.append(3)
return l
- with self.converted(test_fn, lists, {}) as result:
+ ns = {'special_functions': special_functions}
+ with self.converted(test_fn, lists, ns) as result:
with self.test_session() as sess:
tl = result.test_fn()
r = list_ops.tensor_list_stack(tl, dtypes.int32)
@@ -74,12 +73,13 @@ class ListTest(converter_testing.TestCase):
def test_list_pop(self):
def test_fn():
- l = [1, 2, 3]
+ l = special_functions.tensor_list([1, 2, 3])
s = l.pop()
return s, l
- node, ctx = self.prepare(test_fn, {})
- def_, = anno.getanno(node.body[0].body[0].targets[0],
+ ns = {'special_functions': special_functions}
+ node, ctx = self.prepare(test_fn, ns)
+ def_, = anno.getanno(node.body[0].targets[0],
anno.Static.ORIG_DEFINITIONS)
def_.directives[directives.set_element_type] = {
'dtype': parser.parse_expression('tf.int32'),
@@ -87,7 +87,7 @@ class ListTest(converter_testing.TestCase):
}
node = lists.transform(node, ctx)
- with self.compiled(node, {}, dtypes.int32) as result:
+ with self.compiled(node, ns, dtypes.int32) as result:
with self.test_session() as sess:
ts, tl = result.test_fn()
r = list_ops.tensor_list_stack(tl, dtypes.int32)
@@ -114,7 +114,7 @@ class ListTest(converter_testing.TestCase):
return tf.stack(l)
node, ctx = self.prepare(test_fn, {})
- def_, = anno.getanno(node.body[0].body[0].targets[0],
+ def_, = anno.getanno(node.body[0].targets[0],
anno.Static.ORIG_DEFINITIONS)
def_.directives[directives.set_element_type] = {
'dtype': parser.parse_expression('tf.int32')
diff --git a/tensorflow/contrib/autograph/converters/side_effect_guards_test.py b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
index de1874321e..bee512abbc 100644
--- a/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
@@ -43,7 +43,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {})
node = side_effect_guards.transform(node, ctx)
- self.assertEqual(len(node.body[0].body), 1)
+ self.assertEqual(len(node.body), 1)
with self.compiled(node, {}, state_ops.assign) as result:
with self.test_session() as sess:
@@ -64,7 +64,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {})
node = side_effect_guards.transform(node, ctx)
- self.assertEqual(len(node.body[0].body), 1)
+ self.assertEqual(len(node.body), 1)
with self.compiled(node, {}, state_ops.assign) as result:
with self.test_session() as sess:
@@ -84,7 +84,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {})
node = side_effect_guards.transform(node, ctx)
- self.assertEqual(len(node.body[0].body), 1)
+ self.assertEqual(len(node.body), 1)
with self.compiled(node, {}, control_flow_ops.Assert) as result:
with self.test_session() as sess:
@@ -104,7 +104,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {})
node = side_effect_guards.transform(node, ctx)
- self.assertEqual(len(node.body[0].body), 1)
+ self.assertEqual(len(node.body), 1)
with self.compiled(node, {}, state_ops.assign_add) as result:
with self.test_session() as sess:
@@ -125,7 +125,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {})
node = side_effect_guards.transform(node, ctx)
- self.assertEqual(len(node.body[0].body[0].body), 1)
+ self.assertEqual(len(node.body[0].body), 1)
with self.compiled(node, {}, state_ops.assign, ops.name_scope) as result:
with self.test_session() as sess:
@@ -147,7 +147,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
node, ctx = self.prepare(test_fn, {})
node = side_effect_guards.transform(node, ctx)
- self.assertEqual(len(node.body[0].body), 1)
+ self.assertEqual(len(node.body), 1)
with self.compiled(node, {}, state_ops.assign,
state_ops.assign_add) as result:
diff --git a/tensorflow/contrib/autograph/converters/slices.py b/tensorflow/contrib/autograph/converters/slices.py
index 9cfa066672..c527f98613 100644
--- a/tensorflow/contrib/autograph/converters/slices.py
+++ b/tensorflow/contrib/autograph/converters/slices.py
@@ -36,12 +36,14 @@ class SliceTransformer(converter.Base):
def _process_single_assignment(self, target, value):
if not isinstance(target, gast.Subscript):
return None
+ if not isinstance(target.slice, gast.Index):
+ return None
template = """
target = ag__.set_item(target, key, item)
"""
return templates.replace(
- template, target=target.value, key=target.slice, item=value)
+ template, target=target.value, key=target.slice.value, item=value)
def visit_Assign(self, node):
node = self.generic_visit(node)
@@ -76,7 +78,7 @@ class SliceTransformer(converter.Base):
opts=ag__.GetItemOpts(element_dtype=dtype))
"""
return templates.replace_as_expression(
- template, target=node.value, key=node.slice, dtype=dtype)
+ template, target=node.value, key=node.slice.value, dtype=dtype)
def transform(node, ctx):
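The isinstance(target.slice, gast.Index) guard added above exists because, on the Python versions this code targets, x[i] parses as Subscript(slice=Index(value=i)) while x[i:j] parses as Subscript(slice=Slice(...)); only the Index form names a single key that set_item/get_item can use, and the key itself sits one level down at target.slice.value. Observable with the standard ast module (AST shapes of Python < 3.9, which gast mirrored at the time):

import ast
import sys

item = ast.parse('x[i] = y').body[0].targets[0]
span = ast.parse('x[i:j]', mode='eval').body

# On Python < 3.9 these print Index and Slice; 3.9+ dropped the Index wrapper.
print(type(item.slice).__name__)
print(type(span.slice).__name__)
if sys.version_info < (3, 9):
    print(ast.dump(item.slice.value))  # the actual key: Name(id='i', ...)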
diff --git a/tensorflow/contrib/autograph/converters/slices_test.py b/tensorflow/contrib/autograph/converters/slices_test.py
index 3c0f81e8bc..c822d53a4a 100644
--- a/tensorflow/contrib/autograph/converters/slices_test.py
+++ b/tensorflow/contrib/autograph/converters/slices_test.py
@@ -38,7 +38,7 @@ class SliceTest(converter_testing.TestCase):
return l[1]
node, ctx = self.prepare(test_fn, {})
- def_, = anno.getanno(node.body[0].args.args[0], anno.Static.DEFINITIONS)
+ def_, = anno.getanno(node.args.args[0], anno.Static.DEFINITIONS)
def_.directives[directives.set_element_type] = {
'dtype': parser.parse_expression('tf.int32')
}
@@ -59,11 +59,11 @@ class SliceTest(converter_testing.TestCase):
return l[1]
node, ctx = self.prepare(test_fn, {})
- def_, = anno.getanno(node.body[0].args.args[0], anno.Static.DEFINITIONS)
+ def_, = anno.getanno(node.args.args[0], anno.Static.DEFINITIONS)
def_.directives[directives.set_element_type] = {
'dtype': parser.parse_expression('tf.int32')
}
- def_, = anno.getanno(node.body[0].body[0].body[0].targets[0],
+ def_, = anno.getanno(node.body[0].body[0].targets[0],
anno.Static.DEFINITIONS)
def_.directives[directives.set_element_type] = {
'dtype': parser.parse_expression('tf.float32')
diff --git a/tensorflow/contrib/autograph/core/converter_testing.py b/tensorflow/contrib/autograph/core/converter_testing.py
index 2025e32817..5ee2c3fffd 100644
--- a/tensorflow/contrib/autograph/core/converter_testing.py
+++ b/tensorflow/contrib/autograph/core/converter_testing.py
@@ -94,7 +94,8 @@ class TestCase(test.TestCase):
return 7
try:
- result, source = compiler.ast_to_object(node)
+ result, source = compiler.ast_to_object(node, include_source_map=True)
+
result.tf = self.make_fake_mod('fake_tf', *symbols)
fake_ag = self.make_fake_mod('fake_ag', converted_call)
fake_ag.__dict__.update(operators.__dict__)
@@ -144,6 +145,7 @@ class TestCase(test.TestCase):
recursive=True,
autograph_decorators=()):
node, source = parser.parse_entity(test_fn)
+ node = node.body[0]
if namer is None:
namer = FakeNamer()
program_ctx = converter.ProgramContext(
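The node = node.body[0] line is the key to the test churn earlier in this patch: parser.parse_entity returns a Module, and prepare() now unwraps it to the FunctionDef before handing it to the tests, so every node.body[0].body[0] access loses one .body[0] hop. The same unwrapping with the standard ast module:

import ast

module = ast.parse('def test_fn(x):\n  y = x * 2\n  return y')
node = module.body[0]  # unwrap Module -> FunctionDef, as prepare() now does

print(type(node).__name__)          # FunctionDef
print(type(node.body[0]).__name__)  # Assign: was node.body[0].body[0] before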
diff --git a/tensorflow/contrib/autograph/core/errors.py b/tensorflow/contrib/autograph/core/errors.py
index e58745337a..c219b372c1 100644
--- a/tensorflow/contrib/autograph/core/errors.py
+++ b/tensorflow/contrib/autograph/core/errors.py
@@ -31,11 +31,14 @@ import logging
import sys
import traceback
-from tensorflow.contrib.autograph.pyct.origin_info import CodeLocation
+from tensorflow.contrib.autograph.pyct import origin_info
from tensorflow.python.framework import errors_impl
from tensorflow.python.util import tf_inspect
+# TODO(mdan): Add a superclass common to all errors.
+
+
class GraphConstructionError(Exception):
"""Error for graph construction errors from AutoGraph generated code."""
@@ -65,27 +68,35 @@ class TfRuntimeError(Exception):
return message + ''.join(traceback.format_list(self.custom_traceback))
-def _rewrite_frame(source_map, cleaned_traceback, stack_frame_indices):
- """Rewrites the stack frames at the given indices using the given source map.
+def _rewrite_tb(source_map, tb, filter_function_name=None):
+ """Rewrites code references in a traceback.
Args:
- source_map: Dict[CodeLocation, OriginInfo], a mapping between the user and
- AG generated code.
- cleaned_traceback: List[Tuple[text, text, text, text]], the current
- traceback.
- stack_frame_indices: Iterable[Int], frame indices to possibly rewrite if
- there are matching source mapping keys.
-
+ source_map: Dict[origin_info.LineLocation, origin_info.OriginInfo], mapping
+ locations to their origin
+ tb: List[Tuple[Text, Text, Text, Text]], consistent with
+ traceback.extract_tb
+ filter_function_name: Optional[Text], allows restricting the
+ frames to rewrite to a particular function name
Returns:
- None
+ List[Tuple[Text, Text, Text, Text]], the rewritten traceback
"""
- for frame_index in stack_frame_indices:
- # (file_path, line number, function name, code)
- file_path, line_number, _, _ = cleaned_traceback[frame_index]
- source_map_key = CodeLocation(file_path=file_path, line_number=line_number)
- found_mapping = source_map_key in source_map
- if found_mapping:
- cleaned_traceback[frame_index] = source_map[source_map_key].as_frame()
+ new_tb = []
+ for frame in tb:
+ filename, lineno, function_name, _ = frame
+ loc = origin_info.LineLocation(filename, lineno)
+ origin = source_map.get(loc)
+ # TODO(mdan): We shouldn't need the function name at all.
+ # filename + lineno should be sufficient, even if there are multiple source
+ # maps.
+ if origin is not None:
+ if filter_function_name == function_name or filter_function_name is None:
+ new_tb.append(origin.as_frame())
+ else:
+ new_tb.append(frame)
+ else:
+ new_tb.append(frame)
+ return new_tb
# TODO(znado): Make more robust to name changes in the rewriting logic.
@@ -98,18 +109,20 @@ def _remove_rewrite_frames(tb):
return cleaned_tb
+# TODO(mdan): rename to raise_*
def rewrite_graph_construction_error(source_map):
"""Rewrites errors raised by non-AG APIs inside AG generated code.
- Meant to be called from the try/except block inside each AutoGraph generated
- function. Only rewrites the traceback frames corresponding to the function
- that this is called from. When we raise a GraphConstructionError at the end
- it is then caught by calling functions, where they can be responsible for
- rewriting their own frames.
+ This is called from the except handler inside an AutoGraph generated function
+ (that is, during exception handling). Only rewrites the frames corresponding
+ to the function that this is called from, so each function is responsible
+ for calling this to have its own frames rewritten.
+
+ This function always raises an error.
Args:
- source_map: Dict[CodeLocation, OriginInfo], a mapping between the user and
- AG generated code.
+ source_map: Dict[origin_info.Location, origin_info.OriginInfo], the source
+ map belonging to the calling function
Raises:
GraphConstructionError: The rewritten underlying error.
@@ -120,31 +133,19 @@ def rewrite_graph_construction_error(source_map):
assert original_error is not None
try:
_, _, _, func_name, _, _ = tf_inspect.stack()[1]
- # The latest function call is added to the beginning of a traceback, but
- # when rewriting the traceback of multiple function calls the previous
- # functions' except blocks may have already rewritten their own frames so
- # we want to copy over all of the previous frames. We may have rewritten
- # previous frames only if the error is a GraphConstructionError.
if isinstance(original_error, GraphConstructionError):
+ # TODO(mdan): This is incomplete.
+ # The error might have bubbled through a non-converted function.
cleaned_traceback = traceback.extract_tb(e_traceback)
previous_traceback = original_error.custom_traceback
cleaned_traceback = [cleaned_traceback[0]] + previous_traceback
else:
cleaned_traceback = traceback.extract_tb(e_traceback)
- cleaned_traceback = _remove_rewrite_frames(cleaned_traceback)
-
- current_frame_indices = []
- # This code is meant to be called from the try/except block that wraps a
- # function body. Here we look for all frames that came from the function
- # that this wraps, look for any matching line numbers in the source
- # mapping, and then rewrite them if matches are found.
- for fi, frame in enumerate(cleaned_traceback):
- _, _, frame_func_name, _ = frame
- if frame_func_name == func_name:
- current_frame_indices.append(fi)
- break
- if current_frame_indices:
- _rewrite_frame(source_map, cleaned_traceback, current_frame_indices)
+
+ # Remove the frame corresponding to this function call.
+ cleaned_traceback = cleaned_traceback[1:]
+
+ cleaned_traceback = _rewrite_tb(source_map, cleaned_traceback, func_name)
if isinstance(original_error, GraphConstructionError):
original_error.custom_traceback = cleaned_traceback
@@ -153,6 +154,7 @@ def rewrite_graph_construction_error(source_map):
new_error = GraphConstructionError(original_error, cleaned_traceback)
except Exception:
logging.exception('Error while rewriting AutoGraph error:')
+ # TODO(mdan): Should reraise here, removing the top frame as well.
raise original_error
else:
raise new_error
@@ -161,18 +163,17 @@ def rewrite_graph_construction_error(source_map):
del e_traceback
+# TODO(mdan): This should be consistent with rewrite_graph_construction_error
+# Both should either raise or return.
def rewrite_tf_runtime_error(error, source_map):
"""Rewrites TensorFlow runtime errors raised by ops created in AG code.
Args:
- error: error_impl.OpError, an TensorFlow error that will have its traceback
- rewritten.
- source_map: Dict[CodeLocation, OriginInfo], a mapping between the user and
- AG generated code.
+ error: tf.OpError
+ source_map: Dict[origin_info.LineLocation, origin_info.OriginInfo]
Returns:
- A TfRuntimeError with a traceback rewritten according to the given
- source mapping.
+ TfRuntimeError, the rewritten underlying error.
"""
# Check for cases where we leave a user method and re-enter it in the
# traceback. This is done by looking at the function names when the
@@ -198,15 +199,16 @@ def rewrite_tf_runtime_error(error, source_map):
# The source map keys are (file_path, line_number) so get the set of all user
# file_paths.
try:
- all_user_files = set(k.file_path for k in source_map)
+ all_user_files = set(loc.filename for loc in source_map)
cleaned_traceback = []
last_user_frame_index = None
last_user_user_file_path = None
last_user_user_fn_name = None
+ # TODO(mdan): Simplify this logic.
for fi, frame in enumerate(error.op.traceback):
- frame_file_path, frame_line_number, _, _ = frame
- src_map_key = CodeLocation(
- file_path=frame_file_path, line_number=frame_line_number)
+ frame_file_path, lineno, _, _ = frame
+ lineno -= 1 # Frame line numbers are 1-based.
+ src_map_key = origin_info.LineLocation(frame_file_path, lineno)
if frame_file_path in all_user_files:
if src_map_key in source_map:
original_fn_name = source_map[src_map_key].function_name
@@ -223,8 +225,8 @@ def rewrite_tf_runtime_error(error, source_map):
last_user_user_file_path = frame_file_path
cleaned_traceback.append(frame)
- for fi in range(len(cleaned_traceback)):
- _rewrite_frame(source_map, cleaned_traceback, [fi])
+ cleaned_traceback = _rewrite_tb(source_map, cleaned_traceback)
+
op_name = error.op.name
op_message = error.message
rewritten_error = TfRuntimeError(op_name, op_message, cleaned_traceback)
@@ -263,7 +265,7 @@ def improved_errors(converted_function):
ValueError: If converted_function is not generated by AutoGraph
"""
if (getattr(converted_function, 'ag_source_map', None) is None or
- not converted_function.ag_source_map):
+ not isinstance(converted_function.ag_source_map, dict)):
raise ValueError(
'converted_function must be the result of an autograph.to_graph call')
try:
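The new _rewrite_tb above is a plain map over extract_tb-style frames: look each (filename, lineno) up in the source map and substitute the recorded origin frame on a hit, optionally gated on the function name. The same idea, self-contained, with plain tuples standing in for the LineLocation/OriginInfo types:

def rewrite_tb(source_map, tb, filter_function_name=None):
    """Rewrites (filename, lineno, function_name, text) frames via source_map."""
    new_tb = []
    for frame in tb:
        filename, lineno, function_name, _ = frame
        origin = source_map.get((filename, lineno))
        if origin is not None and (filter_function_name is None or
                                   filter_function_name == function_name):
            new_tb.append(origin)  # the user-code frame recorded at conversion
        else:
            new_tb.append(frame)   # not generated code; leave untouched
    return new_tb

generated_tb = [('gen.py', 12, 'tf__f', 'x = y // 0')]
source_map = {('gen.py', 12): ('user.py', 3, 'f', 'x = y // 0')}
print(rewrite_tb(source_map, generated_tb))  # frame now points at user.py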
diff --git a/tensorflow/contrib/autograph/core/errors_test.py b/tensorflow/contrib/autograph/core/errors_test.py
index 7be54563a1..c0e2c74e47 100644
--- a/tensorflow/contrib/autograph/core/errors_test.py
+++ b/tensorflow/contrib/autograph/core/errors_test.py
@@ -28,88 +28,76 @@ from tensorflow.python.util import tf_inspect
def zero_div():
- return array_ops.constant(10, dtype=dtypes.int32) // 0
+ x = array_ops.constant(10, dtype=dtypes.int32)
+ return x // 0
def zero_div_caller():
- a = zero_div() + 2
- return a
+ return zero_div()
class RuntimeErrorsTest(test.TestCase):
- def setUp(self):
- self._fake_origin = origin_info.OriginInfo('new file', 'new func', 96, 0,
- 'print("hello world!")')
-
- def test_error_replacement(self):
- _, zero_div_lineno = tf_inspect.getsourcelines(zero_div)
- src_map = {
- errors.CodeLocation(
- file_path=__file__, line_number=zero_div_lineno + 1):
- self._fake_origin
- }
+ def fake_origin(self, function, line_offset):
+ _, lineno = tf_inspect.getsourcelines(function)
+ filename = tf_inspect.getsourcefile(function)
+ lineno += line_offset
+ loc = origin_info.LineLocation(filename, lineno)
+ origin = origin_info.OriginInfo(loc, 'test_function_name', 'test_code')
+ return loc, origin
+
+ def test_improved_errors_basic(self):
+ loc, origin = self.fake_origin(zero_div, 2)
+ zero_div_caller.ag_source_map = {loc: origin}
+
+ ops = zero_div_caller()
with self.assertRaises(errors.TfRuntimeError) as cm:
- z = zero_div_caller()
- zero_div_caller.ag_source_map = src_map
with errors.improved_errors(zero_div_caller):
with self.test_session() as sess:
- sess.run(z)
- expected = cm.exception
- current_traceback = expected.custom_traceback
- for frame in current_traceback:
- self.assertNotEqual('zero_div', frame[2])
- self.assertTrue(
- any(self._fake_origin.as_frame() == frame
- for frame in current_traceback))
-
- def test_error_not_found(self):
- src_map = {
- errors.CodeLocation(file_path=__file__, line_number=-1):
- self._fake_origin
- }
+ sess.run(ops)
+
+ for frame in cm.exception.custom_traceback:
+ _, _, function_name, _ = frame
+ self.assertNotEqual('zero_div', function_name)
+ self.assertIn(origin.as_frame(), set(cm.exception.custom_traceback))
+
+ def test_improved_errors_no_matching_lineno(self):
+ loc, origin = self.fake_origin(zero_div, -1)
+ zero_div_caller.ag_source_map = {loc: origin}
+
+ ops = zero_div_caller()
with self.assertRaises(errors.TfRuntimeError) as cm:
- z = zero_div_caller()
- zero_div_caller.ag_source_map = src_map
with errors.improved_errors(zero_div_caller):
with self.test_session() as sess:
- sess.run(z)
- expected = cm.exception
- current_traceback = expected.custom_traceback
- self.assertTrue(any('zero_div' in frame[2] for frame in current_traceback))
- for frame in current_traceback:
- self.assertNotEqual(frame, self._fake_origin.as_frame())
-
- def test_rewriting_error(self):
- _, zero_div_lineno = tf_inspect.getsourcelines(zero_div)
- src_map = {
- errors.CodeLocation(
- file_path=__file__, line_number=zero_div_lineno + 1):
- None
- }
- with self.assertRaisesRegexp(tf_errors.InvalidArgumentError,
- 'Integer division by zero'):
- z = zero_div_caller()
- zero_div_caller.ag_source_map = src_map
+ sess.run(ops)
+
+ all_function_names = set()
+ for frame in cm.exception.custom_traceback:
+ _, _, function_name, _ = frame
+ all_function_names.add(function_name)
+ self.assertNotEqual('test_function_name', function_name)
+ self.assertIn('zero_div', all_function_names)
+
+ def test_improved_errors_failures(self):
+ loc, _ = self.fake_origin(zero_div, 2)
+ zero_div_caller.ag_source_map = {loc: 'bogus object'}
+
+ ops = zero_div_caller()
+ with self.assertRaises(tf_errors.InvalidArgumentError):
with errors.improved_errors(zero_div_caller):
with self.test_session() as sess:
- sess.run(z)
+ sess.run(ops)
- def test_no_ag_source_map(self):
+ def test_improved_errors_validation(self):
with self.assertRaisesRegexp(
ValueError,
'converted_function must be the result of an autograph.to_graph call'):
- with errors.improved_errors(None):
- pass
-
- def test_bad_ag_source_map(self):
+ errors.improved_errors(zero_div).__enter__()
with self.assertRaisesRegexp(
ValueError,
'converted_function must be the result of an autograph.to_graph call'):
- src_map = None
- zero_div_caller.ag_source_map = src_map
- with errors.improved_errors(None):
- pass
+ zero_div_caller.ag_source_map = 'not a dict'
+ errors.improved_errors(zero_div_caller).__enter__()
if __name__ == '__main__':
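Put together, the reworked tests show the intended call pattern: to_graph attaches ag_source_map to the converted function, and improved_errors uses it to rewrite tracebacks of ops built by that function. Roughly (TF 1.x contrib API, with the failure injected for illustration):

import tensorflow as tf
from tensorflow.contrib import autograph as ag

def f(x):
    return x // 0  # fails at session run time

converted = ag.to_graph(f)  # attaches ag_source_map to the result

with tf.Graph().as_default():
    op = converted(tf.constant(10))
    # Raises ag.TfRuntimeError whose traceback points at f, not generated code.
    with ag.improved_errors(converted):
        with tf.Session() as sess:
            sess.run(op)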
diff --git a/tensorflow/contrib/autograph/examples/integration_tests/BUILD b/tensorflow/contrib/autograph/examples/integration_tests/BUILD
index 1368ce244c..d20c17b63b 100644
--- a/tensorflow/contrib/autograph/examples/integration_tests/BUILD
+++ b/tensorflow/contrib/autograph/examples/integration_tests/BUILD
@@ -22,7 +22,19 @@ py_test(
"keras_test.py",
],
srcs_version = "PY2AND3",
- visibility = ["//visibility:public"],
+ tags = ["no_windows"],
+ deps = [
+ "//tensorflow:tensorflow_py",
+ ],
+)
+
+py_test(
+ name = "list_literals_test",
+ srcs = [
+ "list_literals_test.py",
+ ],
+ srcs_version = "PY2AND3",
+ tags = ["no_windows"],
deps = [
"//tensorflow:tensorflow_py",
],
diff --git a/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py b/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py
index a2fc7c550e..73125eb452 100644
--- a/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py
+++ b/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py
@@ -20,6 +20,8 @@ from __future__ import print_function
import tensorflow as tf
+from tensorflow.contrib import autograph
+
class MinimalKeras(tf.keras.Model):
@@ -27,11 +29,34 @@ class MinimalKeras(tf.keras.Model):
return x * 3
+class ModelWithStaticConditional(object):
+
+ def __init__(self, initial):
+ self.initial = initial
+ if self.initial:
+ self.h = 15
+
+ @autograph.convert()
+ def call(self):
+ x = 10
+ if self.initial:
+ x += self.h
+ return x
+
+
class KerasTest(tf.test.TestCase):
def test_basic(self):
MinimalKeras()
+ def test_conditional_attributes_False(self):
+ model = ModelWithStaticConditional(False)
+ self.assertEqual(model.call(), 10)
+
+ def test_conditional_attributes_True(self):
+ model = ModelWithStaticConditional(True)
+ self.assertEqual(model.call(), 25)
+
if __name__ == '__main__':
tf.test.main()
diff --git a/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py b/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py
new file mode 100644
index 0000000000..680b6dbaf0
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py
@@ -0,0 +1,41 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests of functions that use list literals."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.contrib import autograph as ag
+
+
+def list_used_as_tuple():
+ return tf.constant([1, 2, 3])
+
+
+class ListLiteralsTest(tf.test.TestCase):
+
+ def test_basic(self):
+ converted = ag.to_graph(list_used_as_tuple)
+ result = converted()
+
+ with self.test_session() as sess:
+ self.assertAllEqual(sess.run(result), [1, 2, 3])
+
+
+if __name__ == '__main__':
+ tf.test.main()
diff --git a/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb b/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb
new file mode 100644
index 0000000000..bf824e2760
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb
@@ -0,0 +1,1512 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "b9R-4ezU3NH0"
+ },
+ "source": [
+ "## AutoGraph: examples of simple algorithms\n",
+ "\n",
+ "This notebook shows how you can use AutoGraph to compile simple algorithms and run them in TensorFlow.\n",
+ "\n",
+ "It requires the nightly build of TensorFlow, which is installed below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "TuWj26KWz1fZ"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install -U -q tf-nightly"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "3kudk1elq0Gh"
+ },
+ "source": [
+ "### Fibonacci numbers\n",
+ "\n",
+ "https://en.wikipedia.org/wiki/Fibonacci_number"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ },
+ "height": 197
+ },
+ "colab_type": "code",
+ "executionInfo": {
+ "elapsed": 7512,
+ "status": "ok",
+ "timestamp": 1532101577266,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": 240
+ },
+ "id": "H7olFlMXqrHe",
+ "outputId": "472dbfe0-9449-4f93-e908-1a0785188a92"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 : 1\n",
+ "1 : 2\n",
+ "2 : 3\n",
+ "3 : 5\n",
+ "4 : 8\n",
+ "5 : 13\n",
+ "6 : 21\n",
+ "7 : 34\n",
+ "8 : 55\n",
+ "9 : 89\n"
+ ]
+ }
+ ],
+ "source": [
+ "import tensorflow as tf\n",
+ "from tensorflow.contrib import autograph as ag\n",
+ "\n",
+ "\n",
+ "def fib(n):\n",
+ " f1 = 0\n",
+ " f2 = 1\n",
+ " for i in range(n):\n",
+ " tmp = f2\n",
+ " f2 = f2 + f1\n",
+ " f1 = tmp\n",
+ " print(i, ': ', f2)\n",
+ " return f2\n",
+ "\n",
+ "\n",
+ "with tf.Graph().as_default():\n",
+ " final_fib = ag.to_graph(fib)(tf.constant(10))\n",
+ " with tf.Session() as sess:\n",
+ " sess.run(final_fib)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "p8zZyj-tq4K3"
+ },
+ "source": [
+ "#### Generated code"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ },
+ "height": 541
+ },
+ "colab_type": "code",
+ "executionInfo": {
+ "elapsed": 103,
+ "status": "ok",
+ "timestamp": 1532101577412,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": 240
+ },
+ "id": "UeWjK8rHq6Cj",
+ "outputId": "73ece895-12fb-489a-e52c-032945d7ed7a"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "from __future__ import print_function\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "def tf__fib(n):\n",
+ " try:\n",
+ " with tf.name_scope('fib'):\n",
+ " f1 = 0\n",
+ " f2 = 1\n",
+ "\n",
+ " def extra_test(f1_1, f2_1):\n",
+ " with tf.name_scope('extra_test'):\n",
+ " return True\n",
+ "\n",
+ " def loop_body(i, f1_1, f2_1):\n",
+ " with tf.name_scope('loop_body'):\n",
+ " tmp = f2_1\n",
+ " f2_1 = f2_1 + f1_1\n",
+ " f1_1 = tmp\n",
+ " with ag__.utils.control_dependency_on_returns(ag__.utils.\n",
+ " dynamic_print(i, ': ', f2_1)):\n",
+ " f2, i_1 = ag__.utils.alias_tensors(f2_1, i)\n",
+ " return f1_1, f2\n",
+ " f1, f2 = ag__.for_stmt(ag__.utils.dynamic_builtin(range, n),\n",
+ " extra_test, loop_body, (f1, f2))\n",
+ " return f2\n",
+ " except:\n",
+ " ag__.rewrite_graph_construction_error(ag_source_map__)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(ag.to_code(fib))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "eIfVy6ZTrFEH"
+ },
+ "source": [
+ "### Fizz Buzz\n",
+ "\n",
+ "https://en.wikipedia.org/wiki/Fizz_buzz"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ },
+ "height": 125
+ },
+ "colab_type": "code",
+ "executionInfo": {
+ "elapsed": 233,
+ "status": "ok",
+ "timestamp": 1532101577681,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": 240
+ },
+ "id": "33CAheYsrEQ7",
+ "outputId": "82a493ee-15b5-419d-8c9c-5f4159090a05"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Buzz\n",
+ "11\n",
+ "Fizz\n",
+ "13\n",
+ "14\n",
+ "FizzBuzz\n"
+ ]
+ }
+ ],
+ "source": [
+ "import tensorflow as tf\n",
+ "from tensorflow.contrib import autograph as ag\n",
+ "\n",
+ "def fizzbuzz(i, n):\n",
+ " while i \u003c n:\n",
+ " msg = ''\n",
+ " if i % 3 == 0:\n",
+ " msg += 'Fizz'\n",
+ " if i % 5 == 0:\n",
+ " msg += 'Buzz'\n",
+ " if msg == '':\n",
+ " msg = tf.as_string(i)\n",
+ " print(msg)\n",
+ " i += 1\n",
+ " return i\n",
+ "\n",
+ "with tf.Graph().as_default():\n",
+ " final_i = ag.to_graph(fizzbuzz)(tf.constant(10), tf.constant(16))\n",
+ " with tf.Session() as sess:\n",
+ " sess.run(final_i)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Lkq3DBGOv3fA"
+ },
+ "source": [
+ "#### Generated code"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ },
+ "height": 1081
+ },
+ "colab_type": "code",
+ "executionInfo": {
+ "elapsed": 289,
+ "status": "ok",
+ "timestamp": 1532101578003,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": 240
+ },
+ "id": "bBhFIIaZrxvx",
+ "outputId": "d076a7ea-e643-4689-f90a-57f5d086dedc"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "from __future__ import print_function\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "def tf__fizzbuzz(i, n):\n",
+ " try:\n",
+ " with tf.name_scope('fizzbuzz'):\n",
+ "\n",
+ " def loop_test(i_1):\n",
+ " with tf.name_scope('loop_test'):\n",
+ " return tf.less(i_1, n)\n",
+ "\n",
+ " def loop_body(i_1):\n",
+ " with tf.name_scope('loop_body'):\n",
+ " msg = ''\n",
+ "\n",
+ " def if_true():\n",
+ " with tf.name_scope('if_true'):\n",
+ " msg_1, = msg,\n",
+ " msg_1 += 'Fizz'\n",
+ " return msg_1,\n",
+ "\n",
+ " def if_false():\n",
+ " with tf.name_scope('if_false'):\n",
+ " return msg,\n",
+ " msg = ag__.utils.run_cond(tf.equal(i_1 % 3, 0), if_true, if_false)\n",
+ "\n",
+ " def if_true_1():\n",
+ " with tf.name_scope('if_true_1'):\n",
+ " msg_2, = msg,\n",
+ " msg_2 += 'Buzz'\n",
+ " return msg_2,\n",
+ "\n",
+ " def if_false_1():\n",
+ " with tf.name_scope('if_false_1'):\n",
+ " return msg,\n",
+ " msg = ag__.utils.run_cond(tf.equal(i_1 % 5, 0), if_true_1, if_false_1\n",
+ " )\n",
+ "\n",
+ " def if_true_2():\n",
+ " with tf.name_scope('if_true_2'):\n",
+ " msg_3, = msg,\n",
+ " msg_3 = tf.as_string(i_1)\n",
+ " return msg_3,\n",
+ "\n",
+ " def if_false_2():\n",
+ " with tf.name_scope('if_false_2'):\n",
+ " return msg,\n",
+ " msg = ag__.utils.run_cond(tf.equal(msg, ''), if_true_2, if_false_2)\n",
+ " with ag__.utils.control_dependency_on_returns(ag__.utils.\n",
+ " dynamic_print(msg)):\n",
+ " msg_4 = ag__.utils.alias_tensors(msg)\n",
+ " i_1 += 1\n",
+ " return i_1,\n",
+ " i = ag__.while_stmt(loop_test, loop_body, (i,), (tf, n, ag__, i))\n",
+ " return i\n",
+ " except:\n",
+ " ag__.rewrite_graph_construction_error(ag_source_map__)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(ag.to_code(fizzbuzz))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "BNRtprSvwJgk"
+ },
+ "source": [
+ "### Conway's Game of Life\n",
+ "\n",
+ "https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "r8_0ioEuAI-a"
+ },
+ "source": [
+ "#### Testing boilerplate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "7moIlf8VABkl"
+ },
+ "outputs": [],
+ "source": [
+ "NUM_STEPS = 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "QlEvfIQPAYF5"
+ },
+ "source": [
+ "#### Game of Life for AutoGraph"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "5pCK2qQSAAK4"
+ },
+ "outputs": [],
+ "source": [
+ "#@test {\"skip\": true} \n",
+ "NUM_STEPS = 100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ },
+ "height": 308
+ },
+ "colab_type": "code",
+ "executionInfo": {
+ "elapsed": 14892,
+ "status": "ok",
+ "timestamp": 1532101593030,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": 240
+ },
+ "id": "hC3qMqryPDHS",
+ "outputId": "8405c0e9-e518-41d6-f5bc-e78df6474169"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\u003cvideo width=\"432.0\" height=\"288.0\" controls autoplay loop\u003e\n",
+ " \u003csource type=\"video/mp4\" src=\"data:video/mp4;base64,AAAAHGZ0eXBNNFYgAAACAGlzb21pc28yYXZjMQAAAAhmcmVlAACZUm1kYXQAAAKuBgX//6rcRem9\n",
+ "5tlIt5Ys2CDZI+7veDI2NCAtIGNvcmUgMTQ4IHIyNzk1IGFhYTlhYTggLSBILjI2NC9NUEVHLTQg\n",
+ "QVZDIGNvZGVjIC0gQ29weWxlZnQgMjAwMy0yMDE3IC0gaHR0cDovL3d3dy52aWRlb2xhbi5vcmcv\n",
+ "eDI2NC5odG1sIC0gb3B0aW9uczogY2FiYWM9MSByZWY9MyBkZWJsb2NrPTE6MDowIGFuYWx5c2U9\n",
+ "MHgzOjB4MTEzIG1lPWhleCBzdWJtZT03IHBzeT0xIHBzeV9yZD0xLjAwOjAuMDAgbWl4ZWRfcmVm\n",
+ "PTEgbWVfcmFuZ2U9MTYgY2hyb21hX21lPTEgdHJlbGxpcz0xIDh4OGRjdD0xIGNxbT0wIGRlYWR6\n",
+ "b25lPTIxLDExIGZhc3RfcHNraXA9MSBjaHJvbWFfcXBfb2Zmc2V0PS0yIHRocmVhZHM9OSBsb29r\n",
+ "YWhlYWRfdGhyZWFkcz0xIHNsaWNlZF90aHJlYWRzPTAgbnI9MCBkZWNpbWF0ZT0xIGludGVybGFj\n",
+ "ZWQ9MCBibHVyYXlfY29tcGF0PTAgY29uc3RyYWluZWRfaW50cmE9MCBiZnJhbWVzPTMgYl9weXJh\n",
+ "bWlkPTIgYl9hZGFwdD0xIGJfYmlhcz0wIGRpcmVjdD0xIHdlaWdodGI9MSBvcGVuX2dvcD0wIHdl\n",
+ "aWdodHA9MiBrZXlpbnQ9MjUwIGtleWludF9taW49MTAgc2NlbmVjdXQ9NDAgaW50cmFfcmVmcmVz\n",
+ "aD0wIHJjX2xvb2thaGVhZD00MCByYz1jcmYgbWJ0cmVlPTEgY3JmPTIzLjAgcWNvbXA9MC42MCBx\n",
+ "cG1pbj0wIHFwbWF4PTY5IHFwc3RlcD00IGlwX3JhdGlvPTEuNDAgYXE9MToxLjAwAIAAAAPQZYiE\n",
+ "ABH//veIHzLLafk613IR560urR9Q7kZxXqS9/iAAAAMAFpyZZ6/h5MpYA5/oqv4s2qPbYpW3jfK6\n",
+ "zQ6q7WMrNj7Hy8jZzmBpfHCwAAO1W4riBNsrapcCk+5V1W0XkkFULR4Qe+H3uGA2HgNW0zFAAUgt\n",
+ "W4tdpXv2OEg0Vuy5W5l/xGRmEGKDyeXyrM0S6q/1EKbad0x2mcHseUqNmeOGLy1N3b376XZKZcPY\n",
+ "IXC5F2332tNMj8CwOQiXM9PiCLyCVfZ3rQSkKBTZErkpS5kXUyoJG3FdIqLjRFKEapbUjcW64HIo\n",
+ "BeIbtRyWV9FyZfcTakx2KW3eB4ZI//MDykSe8CRgN76uBEqZFXwO63wmUREhHOb5AdaLV3xyGl/I\n",
+ "RV70rU/3t9t1aq5mFD3hy1aLTAV2U7nG072dyX87F7NgCxZHT2kFxu44fxf6gqVzE3PEbGr5fx9x\n",
+ "7TKXtmY53VP8UaeCd2HJiZ/sd165SutTnfiWvaLuCnmmXGF0AGqbj9S19kgOhTubZIJBydTTqQOV\n",
+ "YRlxbgKn2nzvunv9+NDG0/2ikyyp73W15QClmjyt8dUeynoN8CwtEQ59DdrAPZe4ARZTwWAfsRXw\n",
+ "1vcZ6Gr1nCNWllQw5IyZyxQtXrfc5p4wjPvGaltciG7d3FG1SGk6HDsZy5i/PsnkjRXLUvGbzYp2\n",
+ "2gs7ZSGfSJbEifctcMGeSqhOOYORKy6f/9omoieCVEEkniBXwWZ/eImb3nxF7SFIaBjgG2j9w5ut\n",
+ "BY6zSuQ5zRCdajzJ1loNO0havI8mp5yViAeAlLKYCxeK0Lha1FskL67W1YsARZVZ5EkhqAYEeTNI\n",
+ "M38Og48OXmj6QBN7c1b9uDUTacYEXO88ZQ1gCIREIMnm2Fgkir8pN4gtSeQ12sfOVz5x5KX7sa95\n",
+ "L4LyFQPDrFZcDBr4PWLeEEv8yzk0cYHE97GmAlA6WQ0HlWsS42cnXefvTPXnx4vcq8pbEo/slAuH\n",
+ "IBsrJEN1+aMCc9FNxwUPVbZVaWVjwLY0qh+mNWEaiNGRmacDXrYWw0NjqMPiLiFHacY5oGELRgym\n",
+ "S2mSo6zhsD1wKQ3EUQtwrjKPiDYc/HCqhkVwoWKUdI8xTS60kn4f5UqB0L77Yevh/wt7AnvQKQAq\n",
+ "QAEEevggRl1uigbOBTtscnYRnAj0edW4QExAzdo+RwLWXTzW/l3cBWTrh3ORzZQlxJ8jQTvPLB+f\n",
+ "bLazJZWFQQDcWhuhQ3gYcP1ruNwIroINRIr8px0UOgAhnk6CllxMN6gA5S0YPhFVFKd3n0AAAC9f\n",
+ "vYgISQAAAltBmiRsQR/+tSqC8p1IAOZemTPutEfx0mzK8zG8tdIxonBsDpoLZ+NnIOp4qK6idP1s\n",
+ "vbGvZz/zHM86Bg3q0yx2atmtgoo/Trt3YRy3se4HTjou+tCi7oJt2d7A8vEhVDu33JNJx+WCOgP0\n",
+ "03nVdg9lBs15v/0w7qMc3zqqJXCOy/Whl9aRhcaeOEWcD7uK6mCV8a6MpDJ959xBRfv2i/qFOFbL\n",
+ "Grs58WiGJcq4MQJI+rVWuFN50oiqBgiunfUrRmdviPYpNN11V9pwcOJwssWfIE3agnor/RC7vfLY\n",
+ "YoXzaJjtWLEL92OOaHLZT0j555xfb4FZcoJee+RXovB9IaoDdYRusngtBXPMUvnO+g2Z5Qdo9P8q\n",
+ "Zb8ItBAeHT8IBZAD/Z2nEA6qbxqOBSBtQNW6ZFYLtCTIoP/bLjCDHgtZk3cf+N1CpXs15pUIYWDW\n",
+ "elZtlTkM4w4EJlLdjLZyQPAeaBx/qoLmKyTKAEhm0hU8EcTq00f6fwkWgz2J6GTGtL/vJXgC8u4o\n",
+ "nTnf+Ou7sVJGVaouXxrzx+yGVHEcp/eV4gaFA95rInngQAOZWbA3558nK61JBPZl3NjEv5B9r9pg\n",
+ "2+SYY3wBAUeu2fgAB2+yYGw82pkoJJKpzYWORs6i1vn3GEgUTcwlYsdJcraYC5SnGvqSZhX7KM72\n",
+ "uE1e9bkpvpVyG/mkACn5R4jwX3xc2utCjjZgM101rirIF/7VfDtmJsSTDes+UVhbSr3SeMSI9ixJ\n",
+ "+fVuFZ5bnQPoRIfPc+Erw+K99JiGN+HE98/eq4pPlMY9oCfVPSdNyOAAAAFfQZ5CeId/AUuqOi5D\n",
+ "jlKfxuJGZZ1+rVyomjOIykvxtsjsuCiGtElbraCSFWcn3aIYWLrF3fPovVLcOnroBkiRMsdf5yJA\n",
+ "F87MQuoKeTaGOrxojCCCS64RiHrqNsE+7mfRRUDuB4sAEHFQHxBorgTukPSvrdFr5QDq+BhZj/6H\n",
+ "KN+IutwFWKX3ZX9pO3sI8My78TgRY5AA6FEcT91WcvnMypB/OWXzK6M8fYuhVVWipAZigjVOYhcF\n",
+ "9i6GweQFX9AV9EUQOp2qFbkrT5jceBRFLX6j4JUQ781/UGTekv1fcpCmzlpNpp8GdSeWxRL4gasp\n",
+ "F5uO5KW63rlhYccBo1cFwIN8txHNnwyQNiP00XC0PWDRZfaWSxsACRWrISow71IyUfcL7JNhjTII\n",
+ "rwDYATS0xZ9ep8siFC3JTxg1eNaroYfeI4tbkRHok47Vk+CUOQPuagVBtFMOOcy2OUbw8AWlAAAA\n",
+ "ugGeYXRDfwHM79ghzBo9nMnzfQPPIuvorxBb6AC8F4fYGD/t93kNSKNSEuhUXq9FKGtxnCkxN880\n",
+ "BPb/uTbjLTQVyPNuYlGl/gTlyLcVA/cDoLrl5TvaR/AcSLFE7C/t3kLx0STNibmdAf4TsHWKSblH\n",
+ "VWB4X7oQHrrDdhwIivRgUZf7f63j2XaGB+cbp5aHCCwJoovY51YTqsZZTz70FlSnypPHQBNzif7h\n",
+ "uvZkXhtEzpu9rYMo3YECkgAAAXIBnmNqQ38BDchAitLfY16mYQAQlVmv7062W8KLpIS1/zhS50Ib\n",
+ "b3ERigmkZKZMPaCsAi+zsLcku/gHGHnVZpuCZMFs72gmyuL4JFo6VjWcr5FtBvzIgD26rBNvP73P\n",
+ "nJjl3JImmFHiKjNez/gG3zTuYyCACuJCEYXyuEmzCM13hdCPHKg5GZtso0Z1qk6T1k2oiqF/3RIn\n",
+ "kyjRWuxBlHHmJ46TXULiUY14G+RAGoXI+u/G6muNclld2bq+6Zztuy+5ynaDWNNjuN1Ag9KUIx2F\n",
+ "XwNdepmp52/rOvISNPbMJ0U26OvqplXi+qHTbg8MLpUSIGCY8w9FZ5woLAENgvgu9M79yGlL20e7\n",
+ "ypJ4RMBqHYDpEz6Z+SSjXD8LsJ7VKlwo22A5Yukp1vTp6HHA35nV+PXK09DuRWKKdQUzmXVihF51\n",
+ "/+bB0PEFdoNxGdbbM7WveaCJN8XI7JgQWvw2nPlHX8M5QyPGSJ2HEexumoFrABvRAAAB70GaaEmo\n",
+ "QWiZTAgj//61KoCPNGHq/MxnjqmxxQAEHvTwibmyMZGX3ES9Abh1tMR+/DjR+6dnqRr/VxCl6gEP\n",
+ "wJ/5EYCYfGaGmQYsLOeM3v2SZjdvqQBwrwKk5A/63kFm8fc3QCLe93Mldv3KWXHdFT7/mudSntDc\n",
+ "vJwStG4jgi5LKlWdSrVaAxOmElsF+zWNzaCIQ1dOiZqi3JKj64hOeq1XIWyGvRvh6OLKBpB4rL6W\n",
+ "ugf7H/IPbSQuF5jWV7zL5LhxWiTiI+kAZTUMfO2YOLzmhCUSN9GAmNzgY4D2awYB4V4QTDjI7kdQ\n",
+ "tL+3Pmfl1HVilu7nC9CzQSvWIosiwv4btyHTL7IPT2gusybyNfW8QO133L6KbDhhXSDWUtcIFCgn\n",
+ "QUm36C9hvgGjorpKYr5VnErpJX6fRJm76fFYs8/nt763alyqdcSrqaTOLaf/72Wkkmlwbq3nLOIw\n",
+ "ADFDkkAPwzaM811K11iK/3HaYRT3nEhjJQFk5v4WBXwIVLAZeKdtC8YoGN9K6isN142fOG3s6fm4\n",
+ "J1nMtOEZHIwep8In4slLmHh39qBzhGZO3igiVpgz7u+JMBeFkVHe72vduBjIy+1dqvxL/TPics3s\n",
+ "+alwfTMNQKave1qW+5Uj8jZQTjcLAtKvzoako9VMIOfQUQAAAQpBnoZFESw7/wC9ZU4P+UeGsidW\n",
+ "4n5tFkXmtxppYvKQ+WGj/x3AAdl6+9c9x7N2b/yJykTvVggfpMnFUWtxla4sr1ouwANom+Uf4IBJ\n",
+ "/zXPovndpGdy98nJbZxFU4rrWpr8aI4YmRX65+IGTn756CZWwXKY5DyMgKnDcCtk0HEuoHgdGhh7\n",
+ "1PG8+nue+pE9pBHqiBNWAjPd90qfMtABmMShLoXtUObqYbqXhJvVjjFhKdPS03IF24fu9Z0ax15V\n",
+ "DnkiLmgyOCvJmcdIX70L2ZEECd/hxrSq9JUVjC41OX0F/ayI6GtkPMUuZ2xWkMFo5rqOAo7v0Zlk\n",
+ "ke/79TjeY13FNiowqcbhMwfDuwAAATIBnqV0Q38BDXNpg2t4nJdhAA5ru/5Co2KbB/AnQt7fa959\n",
+ "0crOQgtTxL36jtVyKPmfuQMYuWbJ/7bYTEV8sEjceHvN6B0CSEZzVCjaPLzOQJZMQpQ4K4WKPlGc\n",
+ "lnEwYAC9Dsejj7Fbk2RyCFiJinyU2HOscjUR6fW2jRsAFpVq/PtZDVPvesPG3AqooVaKHp9Ex+Da\n",
+ "AH0OvccSugyDKsRBAEiYR8645aXxbFSzraQsELDsIIr6HRN8F3lUNVBvzNO3mxBhq4th/kgZSjjJ\n",
+ "JZrYmg3UfIUO/jn4xs2XQ9Pa7Uy5K3JhuIQwAOUKDmAMC0p6fgz2on4ceyEcfiCGDPZpPyL3391F\n",
+ "dXID0ctPQ1a+Hk7UcAc9gSDL8CZKz59YyO0ACPjfAKV3Y2dbTAKdWBsUU0EAAAFEAZ6nakN/AItk\n",
+ "aaqbMCcBE0iEIDnEBfRZN0neHQxaz5DPSzK0ZSL640q0AA5jkP0YAYAumNCN0MxJYpWFoQ9r43H0\n",
+ "i9SZLdv1UbgpG3aX6KESZW7AgdlevaBngH/w8xYsqWx5t90zzi7x9VyRYpIAD+XTrxvgBoFILNCs\n",
+ "gd+zDA9uvbAPlLMwG/qFltlwvLokMt344erv3a/C/ySOwZHFzpakInpJ7MQHkmKi1KHZB5KrfqwF\n",
+ "FnglZJwWbe7LtVojTdwQnAksziDNlEWCkMQQJwziY1KYtlXMNX8mZ3MtYR1KNf/CNin7/ys9ZQyx\n",
+ "4Zlk//H5KDc/8O2+JaxH20CAaAABxgSxo+yJal1LnRHYfOQ1TygNueW/rPAA37g/6fLS7mbYKz7k\n",
+ "dsiSiy1mAV7n/qq81UHJPShQSXK+E4Y5XKuXEWG4AAAB8UGarEmoQWyZTAgj//61KoAW7kO9JCjl\n",
+ "XSE6nAngAJVxWWFl/YDS0gZ32xjwUFed4hmI6rj18z16nS3Mz1iMmFblrtaE4zGXS046COODiIwH\n",
+ "QG5lRmcBExMKlnynQruQtA8n/NitzdP/ysLrucGyp5nKV+XyJURULfxk4kwNp0a5TFlJ1fusOOJm\n",
+ "y0hvsvEg+d4Jz3anvWT6M9n5A84CGucNifV+WlN9gI9gs3qSoCZdU/gglcFYM5u8YchzhQFyMKxn\n",
+ "kpfWK2LU7aaZHt6xLbqjuv74523K9/dtrrsFq/LySiv1P9Wk6/6d5RC72z4cyaUq6hMMn4IWWRo0\n",
+ "zJIM1/lSYsWxt5/M1Mkv00Rt8OZvmLxuFfd1BIVlANlpgZ39RYhqqzU6v1HwaW0EudelFBGhr5mf\n",
+ "GaDE05Z8ywp5rN4Qq4D4GNAGD/qgEjtaDDf4ZBAD/TAHBwxfNjm2nPAdbbbIuWSkkv8NK6EMlKqH\n",
+ "mOktd+CB3P6Szd1+HPnUsyQ3659r3XLnoi0cvM4usfW+BgxqT0mgHSgn/F6ajdTNM+a8xJQnT036\n",
+ "7195r0uF5vwi7PIviCQ2E4Vs4Wx80/8tBDEJS4qOY1YJ5aNV1OV82fB3HOimLHd2vU/d4Cv7OBh8\n",
+ "k3gNFcjeBGh+3lQcDCLZrG1mAAAA3kGeykUVLDv/AGVBMHxAlJYGEpFnv2bb0ADrwvVKxe7+SIJI\n",
+ "g0dPJdL0s9Hd2mGX7rpdIiUH9ZgtnBO+m3uPNae/YtN3u2p0kkCez2KiPNqgSoEcHM+ePgq7afkq\n",
+ "0HHTSZl/+QbjsyfbI/0lv1mLAJUd3u7VZPPHSdXK3vwLfAwOe3Nid72slU892DijWVvanzM1IzDQ\n",
+ "XfN6x6GH2qfaLrHePrJTJxXC/RSxcAol7x2JJ5OA8VjN8jXu0yKirBiYqgcdFf9odG8j4bRmE2wD\n",
+ "MG0SKuGrJfd91b6B7hbRUwAAAPYBnul0Q38Ahz7YAbwPIqnkAA5sEIcKo2/sVUP0LEeFOLjKjaet\n",
+ "5YFAjDbL5BIdGqWouG/H8ozoec2ZpUbIZu0ELtG5yXc/5opSZlnqbOpqdTQkLs6gr9dv5GbFvVjS\n",
+ "Os1j9FIMQsdc8pttosNtygWB8gLxr65El6umAZE5CVU9Mc8Xxg/tenmTduGK9Cd7qRDiu1sLYR2f\n",
+ "or3KBMo8ebz5q5EmWucvREbYSziQIIycIwJg9OG+aH+ZUEQbjbfHfaiX7yoxGJGP78aNOHP7GvC+\n",
+ "JwM6DxnSyowUBAqkW8ckgrhet8gYYrt8MIe1MPJQB6sv8hHuAXkAAAFWAZ7rakN/AI9XvmYGr0rf\n",
+ "QEvrPPTQWEAA5ru3wBCXPJiC8OaE25OBvVl2wRXqp61wQU4HxGJCAxkSOz+G3Yzvg36uCK8bPZTq\n",
+ "avaOG/H9WxjsuwAl/bIYJdnyD151CiUZ34aErVIixKJ53oKrLeHr3xLgxuH+y3w5uH5lQRsL0Pmp\n",
+ "0jQItTBkKwlPywxFk55pROuYZWi/h/N19QaFlF7WPobUElLlr+nCH+pVt1nW9/YwVGz/cO8zwmWe\n",
+ "Fb0OnFji7CYSsi9ScC3a50GjUP7IpaY5NAHv33V57bkO/BD6dnreymTbSmQdcj7PAJkvz610fMqn\n",
+ "mDGTMB31oxAIE5eWeH7mBZouSgmtxEamul7sYaTPe7mP6FqNCz0h6wLot/zAFwx9/D2+XB0x8mmS\n",
+ "b086o+gqkoYoHQeQm2Sb3MU1Bz0KHDGo9jCmsBmecxs3oNHV4KaIoLKAAAABrEGa8EmoQWyZTAgj\n",
+ "//61KoAcdmk2P6doyaR4wEHxsIcmssCD5f+3/v8PGtlbWZ+A0oGGFPTAdgmU2TFbrRxlmwUCouNe\n",
+ "8freV7blHDodFImzwP3saA3AZT6NUl7vDGH/tw5n9y8rP4XGnhEXBHK+6jIhoAYc6G1CDX0mqczJ\n",
+ "7tbei5I0YSkDjza4rJSbAF6cRoJQH3s2Q+ggBQR0BfH6N3QlPVwd9YFvP6++J+XrbNU56Pxu6Wey\n",
+ "51asar4AaARXHregTXL4xn/VNt8Ppk2xD3/1jXAVXdqMlS0tYGM/TtrcuTC63Lx21RQtklG6k0xA\n",
+ "eWm6W0oL0KTvxuyegpC2ySp5v6zpSEYvzWR4IYirfT0RYU+jLtX0t4M/L/0k8xOLTHbouoUPD6DN\n",
+ "dYYLYlVX5noJzjCAVCiS21OCcIKqWD/YiU/+dTZpdFFNdHEa/MPvUEq7cJD7ANJ0YUweepq2Eqdh\n",
+ "57SC4Tpg6jyEnFgMaHQLSz1nJNh4lxM1TPouGZ9bmQdDr9WY+nwzRBa+ZLnaqBSYKWSKEs/TNtNZ\n",
+ "ev7d+EnJUf9G9CAmmiSDlRAvAAAAz0GfDkUVLDv/AGU2nAwHHyQlvUxuENDSO8vXFIAPilnMlQWb\n",
+ "nTHwb8wkIo6JKOaIP9blrrNXcWeeQDVprB1Bn//+nbSDHls1apJcUyMHUmojA58P91gutTiF40zp\n",
+ "fDaF096G01gcvpH5Za4+DfUvxQpt/wH5PntJzggww1tLhP1NyH5U2TTgrnA/BevK2aCa9xCuCVgA\n",
+ "JJZF4uqHE//COeWbJ6LIFJPoadxAxbrAcxPQQHMzEG5G5S3Yfd+YJBLrdO35JvVrsUTYO4AfvJeC\n",
+ "zwAAAe8Bny10Q38Aj03WPPyvISnWAC7KM5WfLH925SBeAKcvJaYOa5WZCzX9H5nU/7qAFTCgAnl3\n",
+ "rAoSnKk1337XDAnLfPYAAOSIcqQwF++e4HouwNVAWCEsVyl7Y6DnBaBT2mD1H8560KoMvm3kKNNC\n",
+ "oxFCc4BdAIXk45JUbGFNGYAjCbBbJInMjwa41HA404yKnJG7rNXdBctnsSL/36UoXvVx3J2tGX84\n",
+ "+FHk7e72CsAyB49ajd62idmFQji9Jj1GaiqtCIjWs5o6Mz8s5QfrvipNYYD0YZ7gBBGm4AEz17d8\n",
+ "isscgsp4QI2odbuEJDq1nfJbW6+1HGcN1XfDC1Xfa5IptM5UYHm5zIT4rSPBIDE6l8/NhVxlFP21\n",
+ "JPQ0DZxnZFvxIBznQbqkhaGZjMafgFoRzC9Nl17x+K6e75RlplRZtXaUIbjAUFBJIQPkoIrT6/O9\n",
+ "NtkAmnl8qqUC1RktW/RjiJqOyRTTITHqNKvKy/0gb88xEvvGPgzcSs2KpkbHJWmCGIlSWEkuqcCE\n",
+ "jBn3Y8XOQxMUxEYeLPJ/9s/F2fT5NAnko+RFlv75fWLekZZP2s17yJ5ccFGhZyrkGX6u7xXK7N8G\n",
+ "Qlz8qfOHvgMQrlB8p4j7qtnPgBPf8mcsM295CuAZxkK+sut074W+0hM24VMAAADaAZ8vakN/AI9G\n",
+ "UrhSy/Rrhc/LGXguupji5cAHC2DVoxU1gWUkKeMT366GcmuxH5O8lBZJeHl8r2KNT0EaVARyW7pN\n",
+ "L4uNsKKl/WAzLJ1OZWTQf4NaAfodQGO9KzZS0j6oGvr/urKiQwbP44Tv//glYQyyCFeq+8nnrHBj\n",
+ "aACu2w1otySh0DYMX412uY6EYcx3GtQaRpNPiKQniWdVV2KH48fVxDy0uLS0SmCZEAWLVNvtWqO+\n",
+ "q2OwCBr1m50s0i8eRTlSP9xoKtxWC4ZqL77eAW3kYEBJOAywYUAAAAH6QZs0SahBbJlMCCP//rUq\n",
+ "gBY3NzYDjVIwwAKbp/vtZn3NtK6t0V/4sA0MV4ijJVoTZ+e36T0E9eQ0LOyzsqR0ULZJUDRy41oM\n",
+ "RdsBwM4wyEJC67daWmuDEXKhZo862uqAH8A0QJ5u5RKBPFpngChYYJdWzP3onEWImG8Yryy/SXt0\n",
+ "jQ5te76AagLius72bzwZ4AZfLm/04ID6oXhPwqkf1cNsu4/kIt7oCOETiL+lzwHLEnEsdPSz3DxD\n",
+ "uLGkH8o6jHofDxEXcB6cOS43aUxGKPYPtHCj2gw6RzcRoX5lD5mwqtoCTxk6N8TxyipSUyNnbA2b\n",
+ "G5NuBUVLHTce3QKY3SdkbyH/wzdOpT3YHUE+FYQwMKCF6SMyMBxp2gI9k4yUZYljUiekF2XIFkfv\n",
+ "TFy1RUmikOycLKkTYTreTarsMD5JfjZ2FJWrroj/YX+uNeGtKNZl9Zyt+k8u4Htq1bPYEjCrLHds\n",
+ "qeIuFWmvxTYEQblStjDXmWfITtxy8KvOgn9iV+KlidrnVhlE7Dz30fuHXxxFZvIzhgU9uv6sSC7T\n",
+ "vZuGMsKGBGTYmSe0P9hLI2VyM/8GUWwG/AITiU4a7OVDjUNRPaiIEt8jt2oImPIY8qcrJ82CVd+P\n",
+ "mSjoppoeHUTHmeo+koGqjhwT7ueVHNT5VZ4yuGKEDdFfEIkAAAEMQZ9SRRUsO/8AYrbCELHs5dcg\n",
+ "AyOPuRHZUWtdXLx9XaNQixO/8Cc4Q2MgEa/wKETsHiR8C1XOv7rI3JB0rg46JfjEArbHaTHmANKo\n",
+ "+czcI/sIduYNFOE3TvObMh/KtGpZSdF+qnDDtY8zD+7RQUdzmkG5zeDj3u4Vq+f3qnKCwgbU+U0R\n",
+ "dQR9Q60wXqL03p/iYVxkI8jJqvkECuxT7efJI+5rmzyP1yn+WKY2EsjjB7bwwVfe6RxBmzR9Ed/9\n",
+ "CA95ILUJxNg4HsmCO2Ko+MqZAH3wMlG18kUm2ogL3cKIkVXogjofyKhbsSpKLpFFk71DzB6NrY/3\n",
+ "HfknWM2yn9yeQB/joufGEf/bvMAS8QAAAN4Bn3F0Q38Ado97WJWiqN4XS53kTA5YWsnJBdebpf+9\n",
+ "lcN5zPySAC6fH/XzBsBKbxdm4pTiPFVrmGXyhaRiB6dxtlwj8MyI40Do8AXHq41BAunk4K4PTgzR\n",
+ "rFycWqaL549wB2C5jNCLXlq6Tuytik3ijlMSkx9noeIG2Lc83eWkRkQieksQSO4xI1tzzkdqaNhG\n",
+ "ExZARu3MauZwrBopslb/ZLdR5ZS0G6p8o9DD5cphJjxJoSV/70/0Gr+woS8Zj0JpVvvpygE5bXQp\n",
+ "/YBCqjmq4uOCyt9SvCzPelUEwXEAAAGyAZ9zakN/AHZ6+HiwE6fxvgA5rqP9zmI+FShvhJS43N4N\n",
+ "sc5a7qq0DK7DHadXkQxf+APmeqLrIGM9X5aCQgeyxdoAlcQoyNsm6ol85w5z6JV8A3YntmCae+s8\n",
+ "+8/Yheg1ctJWrSharoeypUyemQeq9Rm5cIkSOS9Ej0hbIHyFhPQW6K3SawgMNVKQ0s1BpJvXDQSY\n",
+ "x3jIEdIgEtwe7zce/DjcO3RNN3g+SlPoM7cl0qJbM44NIDG9JGXcwVrY/YKNrpChX0yegP2ZHDI1\n",
+ "MzOs5eWP/2l5loJrLid2mK4Qhw6EGFrIadsV8rSjzgHRNuzJ4U3JdubidEobU0ehkU0P6MYRK/XM\n",
+ "58mVywGbsw6LPu56h1S4w3zHGYMd1zPKOsnCUhaRfrSZTxvjerNQ22prVPqBstk4JgHdnSScrwGw\n",
+ "eQcqvIw7gKhonPDKM4fJtO4n2EsI5Cd0iGMjmgPw/PU3FL8ZP3QbYLMwZ81Wd7BLLBDf+ngKiFIe\n",
+ "it4neyhhaE/a71b8TxeM/ZrgH9+D76dlgPI1ZJW6CCVyIs6Y5gK2plkcgRYa0MwWF+1A6zPtBEgA\n",
+ "LOAAAAIIQZt4SahBbJlMCCP//rUqgBY9we30eRuAA2kMf/9/gX2SHKs8Uq31+W7Vx4LugxILnhMT\n",
+ "6icG5WQzdpL8yjIXjBq99nVaYweUdJE3LrdOpsVxNJ3kODVBkposYOoRuOMi/SNhcjrJwShp6ljG\n",
+ "Qs7tSeRJSYDkvm+SI2ckjbManbEesw6wo2ZffuryaLuWkU9SNALC+2QbPJD4bFy7sTmB9+6VOdMm\n",
+ "rnLvYN4ZyAJz7OhQG85P+JnxdgXgvSv66sWBs05p3vOE+53H+HQCMTLVgvoYmHNTIYtZ5CIln4hA\n",
+ "GrjLg53unVVQTiYlSzZrRE2vmtsqac+v6CrcbtgC4HktflvPTsvgqWNHri9NWa+EuXgx/AgGkZVJ\n",
+ "r1n6gAd3jtjLtv6YvbPiBBo2AhBUxCbYyroAjcvjwUBtRjXTdDEvdYfItmTKA7W3+KvVi/PCtod6\n",
+ "/3gOoaA7zRdO+8+MHlGl/c2xzQhj2O1n8eJkOu+NcsBkpmxyosDi11EOEaiQ6vfnOvH9MSM+7D/v\n",
+ "k91SLlwv/nF+5eDPHSLZQIoFUjHjwVoSGCdOLqmIe6tsfTERCeAhC+1bhRhe0612KIL6izjolsR2\n",
+ "nUgrl1o39HqnKAVqQ/HguEezLTgmGW27Df2kp4E1wRl/EQgEcsMfBPga1ndY4uHPYq84ArNCWk+c\n",
+ "YwxlHAPVC3PK3Zp2kQAAAWFBn5ZFFSw7/wBXFVHDEfqz5TAg6AmqzzGCl9B1ICKhB+tKz4Y9Km1L\n",
+ "/vZyZ1OR5rO815FlrTgGoncUDKVNjpKrVerCm+HleHb1b4FhYQG8B61zGq10uLuoQHIyL4Cv2/mm\n",
+ "s5Mi7ZftErBt64oWYphUyh0Hmn9dYYheGFzLdE9gvqcAEGJDyLZq+nfiK0Px8pHIgaIfsEdSUYcC\n",
+ "8Otyxta0EKY+Dm2m8AtQ8jjuDmkSHm/uLhgf1uCnztOKFhkR+ydRCeR9tnIlTfiv3gJbsPT8swjP\n",
+ "0OUm6yT8LhwwCJU0AGI9hN0/kTkz+NeSHjSPaBx26MAfS2Y5NEtva844h4B/RttjqxMsNDiDrfB4\n",
+ "5xn/Cl/3XrcF40eivyUSC+FHzx3M4BoLQLOKf7iz8hKiUrqRGVkGToUMxkr5192x9xCjbuvLRMd8\n",
+ "9Pel4WIOhSi52xuSf1eEhC5VVAp4lHpZmHCbgAAAAaABn7V0Q38AdnTaV3jxqK844c19uepGJJSA\n",
+ "C7DQuTz6pWfCzxcMbX5JwHItpyM9y3YT46z61a7h5Lyukp+nSKoO0zQhT0EB/u6ILUCNvVbb/89X\n",
+ "7TVI5UN6EFwYYfi4uoFmqb+5Cd0J/+d2405yTsK/f6WH/T+vNB1DYWrW67ctgHOgMHAWDLG9mitl\n",
+ "16bXmPVSi2sWzpWYg3147nlnaD00aZHqQlrMPzYTLLFwWHOLNqCoWpNLMMEevc8AnQWeykk9VNTU\n",
+ "NXzAXhrKDXl1tLQTxZG7GX3K9cQyeUnjfH3rMBGDD2zCLGXrMfPVl9EJ/F5M49Rjn38sXUf2JvF8\n",
+ "D9r9tV1APCHN27+egfFIMDg9OhrQMtjAe3WEfpYS7pl5yHh7ZZ2CedEo/Wf/ygYTAQFI72AaUTrV\n",
+ "n47d9OSqAdYs7lkgV0864auRyPQeTKK1Sp3ADeIFS134VGBNG1VnrfyZuznYkI2r0FVkGFrAXpUu\n",
+ "ZJmyKqqILhJ1OTBM8C0VBV2QXBYa2aSn2jj9t40/wJJWc9IGAVR0vj/u+wFocjwf4QAAAZYBn7dq\n",
+ "Q38AeUc/pR5QUuADgu7/kKjYlIf8yn+MfKKvFMJ4eRJz/DRqteBIBJsZW3T3phi3NzuSw0zOvEhr\n",
+ "CHz7xEUteyaR+fa6YCBeiCtangbUerW/UGoCobzV/74XB/lXH53NcEw+6x9o3/ZgwG/7l4psK3P0\n",
+ "EqSwtCrcKAAv8Wi0Z88mFp3Sp19shMF41mqYa8pNsyefrruQONS60LHg/1GySbrTeTWW74lCDwnt\n",
+ "BGXpwghp/QF087PP7hxkE8lvu8APh5F1FTiOCBSvJFm6yFC/tz24gmveLoV4Rq/qtYWRE09VDCDH\n",
+ "yjftToPMsyi4DoCtXsPRk5Jxr9Mn6xDxGjfz8uMmOKJ15ejPi/Sx9cR1QrBsU9dhcYifdB+c0AMF\n",
+ "PolB3N4pBZAASP6m7EzaTer6yZ2sIKcQdlGt9xsZ0SHtS2313gpdJkLEVrHpO5/BTcfUTTcK1+bC\n",
+ "PwRYX+iIyInP1m6htprdy84ySZ5IaGCpRKFxMCf5w22wXyyon+dlMPKACguyEPTCCZQ2MqEuC+sa\n",
+ "uB/hAAABxUGbvEmoQWyZTAgj//61KoAXgR9s4tVmwJ9HTza3s57iAAoQf/wjqzjlXnP+29f12EfR\n",
+ "S7B+4I2epG2qM/uoQ7VlrfXFlhjyX/aTq0n55QXAKa2xUKolKsuMfmZFFc6+GP96b13JiSidvPgt\n",
+ "2SSGnq9Yw4MfceFmgOaZRcwoMnpdb0UpI73YdP+DfypKyrkDqKWcBc/BGhrH8+XdnpCNDXfg5rMl\n",
+ "b0uFlQ11yUxnDYOfRwLbdjJA6FYddawSEVorFtY7jkSQx+OUBUgWkKC9rhKB+uV/yqQsvbuFiyYV\n",
+ "MviBpsZgSSN0TOC5JedQ5H38ENVBLjXnWZD9PQyueLoT4qwtI+7lodFSnBG3zboWdj6P7XDbgKT/\n",
+ "zKkFObUjwhstiQtohzxd5AXhBH3DQqNv6mRzuMxFDcTEo5ut/0/1HrPGOF4R3sJ/eQT+YnYseqvc\n",
+ "0m5njpgI3qkLmn8efBB4q3zWGpHCxBwC84HKjuugMICuXfcJHKn0aWkn65aEjT8AdxDWE09InGyo\n",
+ "EM1wsU0JgJ/qq/6MdHWfQW6+bt5xWlpYJ4axi9wZc3Aoz+Rixn8UVM2e/bd31+W37ucz9udquxnL\n",
+ "2JdNUAAAARlBn9pFFSw7/wBZVXkLa/7xg9HEtDOpc+GkSv0gCD3x6eQNkROUaCyL6QH8m/0USPLW\n",
+ "nllgC+uXg2X8kUpaUiErsLvwKd9y+trtKwV7xlvkAn0JqEnToCvptE1Sb8eF86DTi2ywy7WE/imn\n",
+ "jNBYQny1cV38ScnZp/V3phWQAYBG3kUdNNuj/FyVB7DgbQbTLK48AO5nLYv8B3LvBNBfBJ+ym1yg\n",
+ "YJXKwjm8kt8xUjO2UGKeggZOs7YHWr5Fj8OX4jV/B3/cMzP+f6YyrayA/80F6f9vgrbTlhWdlFQ8\n",
+ "QtrHKjmrl874OSSPJYH5wfQfF/1NrQd6soxjmSWYI9/FqOPoy6ujUPxQvg1fUda+wK31Cv8gD96H\n",
+ "LPqpgQAAAXkBn/l0Q38AeBaU9hYCjxV6lA176iBcJKIHTfhwkqkAB+a0LmdvcgdK3vyEsSkCI+8U\n",
+ "up3OQ4OQId/B45+Mf5P4Fc2VsfnQAACxyzNkvgEEYwZk+TyOR6/VZmeFNYMrBdqc2NNBlh56ISK/\n",
+ "h5V9lagvsX7yv0p9Hk6RXo3uoMgKhKOv/QgBAqhUvAKDw4DS7G31tehd/myRMmCPxIJ79bZsQe2/\n",
+ "iq7Nquzc/VDpPXFZHPvOmiyfyrt6Fxc2jLHZJGpvacPTIeLJiSaBxgRTEKBr/xXaKQjc5nLhlwgc\n",
+ "HSz1WRlyOsXOkob3rY8KoGVETaaIvHEl7sVHsV3QN7iR2rIGzf6YHv+c3l8OW1b7tAMShtcCLifl\n",
+ "8k1OtS8Z5o7MNTObuLXIONSPGo1fC97qRzqHFEfMZntEMqsFjjWPM6JduvRiAv8p/h0kRdcTeRox\n",
+ "t4PEdFJikYgCJgtFa00LDpNvd6Vv6MImiivCAgL9L7zEaNCr8p/p5ZiDugAAAO8Bn/tqQ38AfAnX\n",
+ "r+Rl0wYAC9kEZglKr0YEZPxbFiynbDVLyUoB5/4mwbggJCKqWcWLXkOc702XkfuMANGy7OD7QUCV\n",
+ "nopFHkp77AuzGvvM2JQndhYVkdbX30/kmHQDID1DcpthKQBbzUjm7wgAOqbulxKDc1OUw1plN1OA\n",
+ "iXs8Ju+zQDtZelKPfekDEF5iPA8IQMn3LLocZ168PVHW73hdmgfMFTsqduJxZ1oiezDuUBPUKdNQ\n",
+ "1lGg5KUsS5A9iNuo+n1shJKCmk20FfXGeNEywAjYeaq4bao/dd8nZn//htlIayY083IymAgdHbKW\n",
+ "UQAAAW1Bm/5JqEFsmUwUTBH//rUqgBbB5O6qXkABRezeefAxp9PjwxeDBuTTFSUNk2voPSz0T3Lj\n",
+ "1K/LmQtEI6YkskJKgxvIXHGf8LHTV/h2Mg/qV3IQ4zvBygOQs98iZyR5jgV+hQ58R6xIcus/6y5a\n",
+ "HrkViRrv8Sk7So3LYWmfkLzyR6vcCKhF/sCJsY8RS8BK5OOGU2Ll4Qs1n4jPQwTLDELf8SF2+07z\n",
+ "zB5hexERnOHmWZ9THKXS8j6NXPrj2p32k0gvmlI4b/Of9evEX9mDBp5GtQHOvTswQ/VYUajAUXz4\n",
+ "5w6EHuB/k+FBz9pe+B69syJ2X5MYn7Qi9rKpCl2kZv4uAWXuNo7oIaU7hr6elcFz53tdL9AEjCAb\n",
+ "BlT3p448134hjvo9lj95CHF5teK1w+R310Gc3NQ0eeJcsiYD2EoVrHHjVDF/m8I8JtTUFdJ3xm+G\n",
+ "muADOcIpcqYbeqyKWwHmgvRze+DMQbkLo4AlgQAAAR4Bnh1qQ38AfBSmnoPKZzTuFWeZOcrkeWeU\n",
+ "yVIALsozlefbqRZf6f7w7fkPoFSkdlxkJJsnO6qzfbc/Kotbm2yeFrIQw5yspszQL8gAAvMHKSnw\n",
+ "f4CTQ2vfLY55MADj1baDD7LZtn0UK1Eh1HnwXobc+mdHd/JEl/a2Tszf/EZ9+J7oMl+BYsjWKwNY\n",
+ "vOv5flnnPLcex/hWFIF4n+hpBybvasl5hI9mV0CeAAyAclftj8N9n7hadcpM/TOVmHbSkJ3cr/k+\n",
+ "StSwI8gY9k3tmbMSZc42caMpFr6YdNCCIj52zmNBccPNFxW+UT/4qCqtX1gc2j7obKDaWzC1yj1A\n",
+ "td8/VAjqVn+FzuuEokhhvubRT3RCdxeWnBTCG0CxwC7gAAACMkGaAknhClJlMCCP//61KoAXgkIw\n",
+ "VJpvAgAqN7f+5rJJcY8tkjj7p4LozjswOy2dTydK33mOBGS+NojRzBOlwt3ro+/vdQIUTIVrXKwh\n",
+ "2SrHPCPJXQoCjJUPkRODCmqbZeBHsv1r7iIOZPpX66HYYhWgPLvPzAb/Nqu9nQqKoyphhNy32+S5\n",
+ "qAFvjRKLSjPAx7GoKGUNMbYduhsBsrvVTwhrV8uWAls2mxYggJzVuRUZSL9cSt+tjl44BXjlbo1a\n",
+ "I7ybNHG97GCzcbSNcg0RA+iqwDsdnrZCO0zsNdWK1qVmER0PsSf0dicSrZwIcxZWy6JbkwQn5TnO\n",
+ "kAah3wAs6pJvW+a5ZiJHl6sVlU3yCOlrECAESqWu0YR75WfiMXgesBOuXGGNsC3icmPYNzM93us1\n",
+ "7GQTI6RmmFHGo+B2yAB2YJiK1YN/T0ltUuXfFAvL4UdHgEXOVIqVj+S+YpITMKy740IvYQ5zuZPD\n",
+ "ahdXF7HIU7xE0W12w+6qkuyZwxUMXLXdgx6svudMor1GNfDCdymcKIidhuuXh7vdQrgbivH7usVC\n",
+ "zjMqgjGahkW1YlmytCooEIoULx5ux9DK360iAi4u/nAomESdiosanRfQ9jQdJSpo4rurLfeCLF1Z\n",
+ "XsQAQRTcezHlxp1tz3A3WsYMA9urPBB8pUlDdB63MfZDCBphVx/Ddv1AMvPXFEPu18oREsV3BdKx\n",
+ "e3lxLWWpytzF3zXttYGgBb90j9DgRGE1uaAWyEAAAAEiQZ4gRTRMO/8AWVV6uU/hFqUNYqrP23yu\n",
+ "FpB+ECoAQNVnJ92i7ZF1i7u1D6K4L4gxm2RaiGsRDmf2iYWEjO8yGHAqwpcDep1/+H221WMh98AE\n",
+ "VV9Ferf+hy0D7Zu5rX4Hp3s1TpcNcEBIKPHVSHIzaZKKfPXkqE/ga/eepp8Bzdc39OW6g91hVVvf\n",
+ "WJxrnf77rapWbmivuJFfeO9u+RRykk/agdEi5E/5a475KGQprA2yl390PNrCvoamPyXbETwtbYAQ\n",
+ "pF9uDZkHdN/NQ1P4rz+zQLJx21eQsP9WBLswpDFYg9BjPw+3VrVEzeid2j5wJBlq+56Hw+Ex6fI6\n",
+ "1O0GbWSAC5/5Zg+kGX0Yx7/We9PseMWGwXWIVwqI7oHPEnK6wUkAAADgAZ5fdEN/AHk02mburIzA\n",
+ "1V5U+8CauxZABexQ9zxvy3GIkNn2+19EyZqnRm0DMMsXP4ZwiY8vW/qdBTlATfbmIFDxCTzt76+L\n",
+ "X3WaNfG+rqTfzj6gLFFHl5IJDtQmIC9KAmTgQM0Lp8TEDdYJnPYGFybq0Xdyl74+130DteV0SYTD\n",
+ "hgB6230zJvCx8ZW04pZHmYvtJ1LZAxF3BAWKPXcstkh7/Er8zYdPblR7K6t0r3b/sIHpME53VRBk\n",
+ "ggj1uN/p+iN4KwToxjP8kZ1opB7xpkyOQpicygiGnwjU7EpZpywAAAF2AZ5BakN/AIdka2Wer/IA\n",
+ "EJVZr+9KNmiS7zXHA/5uJU6D0CbJOrsLPWcfwAUCZZjhlCsnAlgzrrGOONmuxU3En1TfTKb/7Pu5\n",
+ "1R8PfIYkV/dZFitvMyRPMvzwXX1OcxtjbhM+M0LCh6zNEWJFi2Pi95t8cspIknD4iXNUblA3oEFp\n",
+ "VGuXt+8S3Upf64YqAxWADhb5zxXL+O/gnWiyawM9fyRrYcExecMkEiv5MHRsJs8Euzdps1vwxzNA\n",
+ "Zu4bu6ic2K2ueNja78qXGaHz7xLoPIVJv/T4KAuseyOhznfFtKf0Ey0eSBVK9qutGGF83lfe5Wtv\n",
+ "xb73lHTKLAyiyJassoDHBSQLAcUPb4nB6xWNr9G9gWtqEIp4Or9tKJzZIZ1tnIKZFZGb0ELAlV2+\n",
+ "pKKDz5nW+syHi871Soc3HtgomT3Y1cp83yQG1GdKkcJPkU1uJVzsVPzbXbSU7/z2Q7cikc4seN2D\n",
+ "ryQ1l58HjUs0ikCXV/V/CDkAAAH6QZpGSahBaJlMCCP//rUqgBbmS0XBN5gNQAaCJTjyhVwVkMwl\n",
+ "GF6KXnd0XUyzqjFCJEv0D2xQiJu8if6sKo6qHl+BP/MZw8ss5OKq407INzCjWOsjf2HTKyC5fNLK\n",
+ "wiJv+PzieOozn64ZK7RRud2QUaDe0kuhk4uCClSYQBImrxmWeEf/X9zH3+ilYhfoZigVm0IoMiuu\n",
+ "YX1ERVdg0Ld9E6wxbYMiQAGJU1qeeTwc8vb3w3kiJheTA2PNXtrJ98RwtpnhN6QxMe1dw+aQWI7S\n",
+ "j0oQ9iNx73N93RuNVRxXj/57S9VltjA0RTZBjLvYS81QDA3fBgaNHNzOBZ7dztz/rTxxOpumjTTw\n",
+ "x9FgnvlMsjx7FYPKUcXD5quVKd8lwTlOiGVI7X1HEv3Hh4EvpYVt6azhUBI1qGunVb3X1lyMhWJ9\n",
+ "p3muqcicwInEt+BuHY92HoNXaaJJbbQmNX5s3QJbI28Pg4gc2gaUF4SQRcBgM8uwcYUzxEkBS06L\n",
+ "0moZm8bwMsLYCLj3fgXOyFudpfg6jkYPDeVK811WbzEz8Hcd42XVL0EwE3bwDc+i2I4+NERo6J6l\n",
+ "d4d7nOIvqUuorZnDPtlYcfSWgBqdP0tQHvFb4Sv9QUCBvXlH2IEiNzo/daaHVtbFRNZ3cag2HOiP\n",
+ "lMxyt8xYJMnG7di2JiwAAAD7QZ5kRREsO/8AVwwP3fRRACC0tQoY45xe6yfL8KMHlR1wbd4HcPUC\n",
+ "+4PcnqOzdoNv80ufRyOopFYryJahX+qWFUVKK+nDtdvegTv/PqvENcT8ykEwwQ7z2oNUdaMITYi5\n",
+ "4tC5YA9FaLSBorMGx3aocAbiF8065MBqyaTkiW7FtGRHVSPubGixAl7hiQRoBoEipfCxkE/EBoII\n",
+ "omSCNrFRyjd8oY66cDfZt+iBI44uLDeP6eHMEpBALsV0FY7iWjBLaYO1t2PsklOb93SAExoyIX1I\n",
+ "TiPXiUgrCYe7dgepAF31BCnOuxiIAPWKLDHZLhGOJBLqdemk1EZoKCEAAAE5AZ6DdEN/AIteG4cJ\n",
+ "hGXgWAAHNd3/IaNiUh/zKhTXYgf+UKkbUvWJoLo7whMXByWkvy3MotNcPaSHeaKS5vKy/hBJIgk5\n",
+ "CWcdsbd5QzFHyjOIZiaEAA1AziqRPTDRRVYKhcrm181rAlAdaYmvKZAOu92pmI39/PSQjhiMouSe\n",
+ "XVT3pg0s+/zN7WMQCHqTmey2TTctwD0YnAH9CK4EMAw1jPCCTXgop9epuL/iXjup2S+LS3pGE3iO\n",
+ "oIHon+1ERGRC2Vp3b2QAstSXzK/2zI+bVnxf0PhgKqa/NeuEaF2SBGZ/TyqGPDnQfJRorCp1s+mw\n",
+ "tm/3aVbjKRTXeSwl+OCfF6rMqjf/Zw8/4yrjLNmiyOgD8OWqATkM50NFqOShrrTCaHdcxgVW70ss\n",
+ "cCXKxvzAUCe+4nK4C3zP8QAAAWMBnoVqQ38Ai2Rc7ISR6q0L0pberS7nbElvP1eAuajd6ehFPCEk\n",
+ "va4007gA4DkP0YAYAumNCN0kma3A2DvFPa+NTDmrilkXNhiNVTFRLzynsy8rdgQPBH6k5DFr/4eZ\n",
+ "jmJjfYPWB5+2eEYYc9uJ5Ni70hsVFfV+T8zp+ZkLZnd2wv7AZ7A8baF9R5O9oQlCkoVPxkDHTrmt\n",
+ "rElQhX8Fi0yj2+BVP5O9UNPGQU0+M3KYUTg9yTBG2cCw6Drt49/5M/86NN03F5R9JS9KGOfJjIlA\n",
+ "koCavGpTFqq7OYU0RM3ilfXBmxvL5QoIK28Uvs71J3h/IvKmg4v/14n3/eoSpqNUCC77ty2SgAAi\n",
+ "rxQNIHz2GF/lpTynlwsORrYNT1lJMVud8AAQb+/SaHWQXmhJ+8cZTt8XuMgG/t/hdF6GqyG0A/Pn\n",
+ "hWRq+asN+zBaeyQUWZrjl8ry0h3WPkAZksFb/gV7ABWxAAAB/0GaikmoQWyZTAgj//61KoAWw9mB\n",
+ "34Nmlq4DQoTYIkneVdOFHxDDrFwsv7yxZXXwNkGuLMduj7QGT/7lr2bNfzApMJfo9/ffM5g789Cz\n",
+ "1Mn0zxePHMHBL6IHHRVXWyqDMhVLYnQ9xFtc1jml18If/8STBCOf+AZjMnARcFmX1IwLt/ziVSoN\n",
+ "e4GPKKZqfZWytoW7461OuaeZ9dvtxrCL+W45zobgR5vOrVM+Opl+w/eFlupHlgpQBWgJcPy8sZC4\n",
+ "/O9laiYA63xx6M701UUvGFsRI+RM6anXyjKc7TVrmZ/YQKRjqB6Mejs2G1mTDkBn7T2ZURI2vZ3u\n",
+ "VXRNsQnGYDxRUokS3YRHs9LEF/gxKSdLEEiHDqcoIHyS2FPM+cIJRSvB7sxIA3hgfN/O4qDK6VO+\n",
+ "t71oi1H0Bkz1ugONnVTpQr+WeMS5AtXXNBMXU+ycO0+R9eRe9BwSk0V6tHm/HJ45oIYvyWTj3yZa\n",
+ "JQ6q+o4isbf26PsTbuSAcvQoMnzEXJkqElGJ8Z3rZtdkIzQW0DDnXeNRbj2wQmuUNBknMsWOw2/t\n",
+ "fD8BErzYLXI65PwTY+6R5c6RWYzF9HNMLBaO1c6cI4yEu1DMKtZW5FrmVuc6hg7VnWxgAgOdFKFA\n",
+ "QvmmcrbHsqCH4rkez1y5GoMlxeOuW5WKa/JdcefAflYgakEAAAEQQZ6oRRUsO/8AZUEtmg0dqwLy\n",
+ "ubLYtABfXw0ri+bvSnwBqWW9hB3/jYP94x5LyZNY560IvuBe5T4EX3/71Gbqj7BS5SJLQ7X1JK0z\n",
+ "I9iR6McwRU2BDEhu+2JQm1RA2fBVxnzCyNr1JVnfyyuumlkNzE8n1UgnkIbS/FMxc8DghB7zqZzK\n",
+ "rkagW0hHwSjNf+LJf3DnbXyvnzmB1lcv8Z9QlsnPKDef2giSgbZeTNWRMfeu91kckRy0SSKkaYVK\n",
+ "KUUpf450Vl2TzPLRaNhk7Du1IJzIJRf9supxssXD9v31LAVibgyznyLU/cS57Vr8KEXG+WpKysV+\n",
+ "6iQmQ/hCoRg82drzuniAPltxm8MMUZwVMGAAAAEzAZ7HdEN/AHUKF3WsfCAA7NAZyuGlRySXJzA8\n",
+ "WtPYIqCp+udF6BaVoG3w794kSqeP3syNbVlr+uFhruNMOOzTsNGrbATFZMl9DU6mhIXZ1HEAskmI\n",
+ "VVSgXlz4sVX35JqYrDPP8r9Bsg/O9tAp7LnTMjWlqOdgOPhHpyqf/hmokPsCwqtKfsDhxP/tmX60\n",
+ "fhM4KsfvpygzK8jmUmY/GDBCISRQeW6U8uaq8guf+cvy+sP09JLJ4HsULhIsm6kyYO04HBdOFUDr\n",
+ "/8IzlOKX3w/FCxhimlJIduY8iySAFQmALOuag1Ry1Z3p7NpGIGhZp/q5hzsMAsH2jpHXQPdtFNFH\n",
+ "4VkqDlRDeGqieCr6gwu3hPQQfF9yauq4qf5R+bfPha9tZ3XjpRO4eqNaj2xEQrcb5cIJOAAAAUsB\n",
+ "nslqQ38Aj1e+ZhXsJE07lvgA5ryx/X3Tt1hQ2T/wP93u+Km2fQtCsS47kHT/v+BMMbdxEWzwYvcd\n",
+ "d3NYalS7o/aUthPBRfYGmx2hUIQijLOXN4leC3SONeoCputIRor3Lgsy985K8UL4nvf1+pFmRQg0\n",
+ "eJgJ9ubt7jVqU4S6enDDZ82+hYwxDWOROomkxsOv8nlizRgAHHE1n42Dq5sLIu8oVYp/4M1h4rCy\n",
+ "m7AmDrR9dbHlpV6pqPLshIJSKr7R6XCF5H/mgt+78ttEoS2XxbrmVQj6DQtTzcYF1gqzE9DaiXTc\n",
+ "rKcf1aBAFclenBiNHhbAMEE20Br4FIkr51a0ynzJocMgaUhstOH+7gKJGCsTPkykOiVzQeIGOfi6\n",
+ "AmLkbzIds0NOnV21ExFbxIFAMu1BymG8Kjwvo1cLb7372R2f+Qt5Z8LjmGrBAAABxUGazkmoQWyZ\n",
+ "TAgj//61KoAWP/AeMmkxh4qDG8hcZFMZjYIY//v8PGtlbWZ+A0oGGFPTAdgmU2TFbrR0QmwUCouN\n",
+ "e8fq+V7LhZ4IhSGjAEZXRALCc6lvXQaVk4Hy29vGup69bTfpCSIWWGXFW7WfQjL50GRbZZRZHQ2m\n",
+ "pjAJ2N9/bloCCNQEfrVxCeDkKfJqKlRpIdnOUaiQpsnEysqkLqMfxaCLAtiv1vFXcLPLizzlMPs7\n",
+ "NIiiAuhD4+CMokPsODEut5yq6fM1zRym2P9iids6rfyvN0EtWlvUXkAIdmS8HfE5DlX5rtipWZ2i\n",
+ "d9rb+tQcwCfWN6erokI6tARQJu2c+ZSF/sI7qofDkfNVCHii2Msza0cnJEbLkEfdF+gBET2KrdRv\n",
+ "E5mgO+6ICEAI6O/h7r7DxvTQ9Wxzo3mHNo6898yojVZYUAEyiEUBn5+alz6XfA0d5GcOXFRjv906\n",
+ "SVSt5h/ZyjXd+HmcrubYPlDuxhjCrkqyrKcbhfJHp/Mq+DI065H9OXdNO/+uDSHvPcKkibqiAVhI\n",
+ "DqTA+NZM5+PbtXMsqU6iKpSzqr3AN5mBITP84n9JoTkmCR2U/+5h8eajZc3UcAAAAOdBnuxFFSw7\n",
+ "/wBlSP3uCsGGoV8bqfG+TF6JTvUuRSAD4pZzJUFnxrFOJYnshFJtjPOw7rAcguf7FPJIlPqbN5qs\n",
+ "fqCPl7TU74m2w4/OJHMnDpS1+crxo620hZORUqqaN/UeMSuSm/KKx2/MSsIgkvOy0fYS1MAD67Fk\n",
+ "Z5FUhBYQOPZatG+Xc3Icj+kvLjp5v9fX+nJsaNN4CCl0quEK1R//8eZO87p6DKKxlnRfV62uCNE9\n",
+ "o2MWYwf9qwHYbtyqG6I4xWPTngQnrsOmiw1Sy0bIvHiKKw6nsCsKdLVPqCFU/q5rppy8Ah4AAAIT\n",
+ "AZ8LdEN/AI9CIO0JMMhrV/0AB0HLuqwUdobO4BdVbPV1Ioua5WZC0IWTaPE/7qAFTCgAnl3rAoSn\n",
+ "Kk1336t4zGyyPYAAOSIcqQwF8zee7dn7XFk1tvgy6W/qOMTmkEiEdwceoRsnhNmrNp/TK9OoMIUg\n",
+ "ShyIuwXG8nP6tDCpAEYSuvpzo5kchXf9jICMUEGqQZjLulIdzbNUEecLTDRk1r3gpdToPPcXdXTM\n",
+ "AElxf3acmkXSo1kx4tBmKJrXm4kNQ2oDIaqLOc1dGZ+ccoProxsI+jQiCldj17rGF1/E4alcIa3L\n",
+ "dIofRLGOPkev2msNj9eN+tELiQktxoUq9fKnDsRx9Nbc5IkysRYA/KsIu02gpfPyisLPQwjLSjpr\n",
+ "jTxnZViCfPC6UCMSLVKUvso8AB0eV8Q+lldoHmqd+EeBeeJOkPU3vuU/GQacMWsLnKmVt/65Nw0r\n",
+ "y1AnL9+YKkDmvNgpqgQANfZvj5NhddHche/p4la1cXWhY3W/jmtWxMTkOC4tX16bao5sNwcVWRvt\n",
+ "UHjkDIOIXB+3akBV5Lzaef6YjjT1MeUeFh/FB0tOMV3Bhvdw35krP/ItZ1RF5hRCk1oYqz0ykGZW\n",
+ "YkciBlvCsweWM2wXwX55h7SZHtxiKM3rO4Aff+TOWGbe8hXaapPE+4wKof+j5KoQ530gP62KsQIG\n",
+ "BV49pf0LYkAEd7yVzO9dhYYFAAAA+QGfDWpDfwCPWoxxjdaiaFtca/OwfG9dSAC6jYuqYuZmzKSC\n",
+ "kzbTtnf9idy9v7frgKuFjQymibohZCHRXBQdujo9Laqcw233I4Za+//Mdf06kxHe/IBTsCsxcSfV\n",
+ "ksVUEdqCe9dEwWwg//4Ee8Le2gLXqz21e4jiFyBOjP5GsM1hpupcfwZtr5Mo/ou28BY4QZExXJ0H\n",
+ "FzCqK0jKq6c//ut1tsd+kiOyZUVGRAFVkS8bi0vvjrj3zga9Zaa6Mt7yQii43DdcrobbVIWdc0QI\n",
+ "3+rsc8fgmOnJ+GJGdWYzpFLd5zMjS5ofw5IMBt0GmHVcG82Z6YQkqKJHzQAAAe9BmxJJqEFsmUwI\n",
+ "I//+tSqAFjc3NgONUfiwAKbp/vtZn3NtK6t0V/4sA0MV4unWIJlE1N72EjQeUPmvxOpceaVXIrAK\n",
+ "21oMRdsBwM4wyEJDPiji6fXmMlmmsCvOtr78Aj8gA+xKnVDFjoVlH7PPNvnMo0iZJruZeFy1B4T9\n",
+ "/2iVnlLy1r3LZhoykeyNXqaKEANWeqYl2HjpH92g+fHSONko5D2m4SRKJwFWFllUBg2RTQ3etVYS\n",
+ "PdQGNCLeaZwhH8zjnIe5Vuu46VBC79Le/PF0x5A18FileZQS8Adcvcamp8leUQ9dML537b7ARaSt\n",
+ "9Lyu3Sdke9BouNe3+hTyxzxAi1Setn//aNMjVtdKZIT0wLvPIMCsfe3gvhpNMtez9cWJYRUO4qU0\n",
+ "Dlg6h/pUIog+BzidDDvn6SZ9WUgEXhGZOFeOBYowQfwTGI3ac1V8O93aTpJwa/om7scQbOrwAjjK\n",
+ "gaYt9yqViBt3FWYRIoJJGYqmGJkf0tLvcymA+Hyayho8kg3J33tLzi7Gkd8xVzsn0AbjvoJ9u5le\n",
+ "OKsB4L1kcStddnytXouu9GStBCQSRLPeb+iGeZTwQ5uYY8D5fTAcb3C6Ob+B7IWRbbytzq93Kz0y\n",
+ "yYvbeUq1qJCNW3/zJeXeH+8yV69x5FRyM+55j6UAAAEdQZ8wRRUsO/8AYsUcQvOGOSSADI46r94B\n",
+ "/W+PEO3biH5wUahFid/4E5wZcJb1S+5KPsyD0qQEL2HibG5BPsDLysut2eDJfU6ijjP6zrYmNEWR\n",
+ "huQfgh9NsMVuoggiphkYt9ccXxVhYHn++9K8YAnkm28Kzp0jUWHgD2VeIoDjCfJPNnBqH+CERm3s\n",
+ "nubUQ9LmttVf/+MNJAJgtOFW5A6IBAcBpJtd5kPS+zJ8VxzguhOiD6Pf/zfgjMDUsehmT57QUanw\n",
+ "gbdNgBf1mSXZw3Czfs4swXmaj+42V39PQblTRJ5hVxxBfyBMHdtD+eP+pUlQP8pBAAnf3v75+Q0T\n",
+ "L19oeS5dx79IIwiodA3vtFf2KOiU2gODZqY3kJGizWNAAAAA3AGfT3RDfwB2j3tYlaKo3hdLneRM\n",
+ "Dlhayh8NourV4B4kYRi+kgAOdUf8hAGAI5XCPTeroAwXn8G2yGEphnv3FPeZqmLNmvgLgUkPciaQ\n",
+ "A3x0WVLvMk+lZn6cJdklOXHEnjNKsClw6wU0RbMDBk1zQUzYb/75rZ2h0N0KqL096XGATDutyhUZ\n",
+ "RVkyTgfbEgHdPAmzdroStgpcOUEN4xVVZX2E+XrryGs2/tIi+iUaglsBszkGSHUeEuoEpHc8PRHH\n",
+ "tDc+6s5rO2oABm+Gux/PUd+4yoXEBbF4DtdMIooAAAHGAZ9RakN/AHaNgkMVTymoPnXABzXUf7nM\n",
+ "R8KlDfCSlxubwbY5y13VVoGV2GO0t+vExf+APmeqLrIGM9X5aCQgGSaQJX4OQoECqyNRzFZQDLhW\n",
+ "KA4dfYJp7oYRPF8AMOzGYqm7AO7w7FtM2J0yD1XqM3LrKYS1dGZTAzMM0YXyhFuS7+8HWwRTCnl1\n",
+ "B1MtLMYaA8qvJY/AATH13D2takXBcx78I1sCsI+P57X6Q2Nh62/bggQuV3uhAAN0tyrIgbNQYVBH\n",
+ "gFwoUmXrxaEApAv0P2E40tM9SJDDcZe8DyE7ljCyxGjQA+gKJHzTkZCCQsmlxDg5It6wsdQ6cusN\n",
+ "DyWnlyoq3MMo7ugMYcm1YMEY73l36Y/R5wo4wUzuNvV2tJ3rSYBCfXsVjc5o1oA8OllKUpgpBG5u\n",
+ "9AavXOqCqjA07sUF9WlQ9JPrhiXa9bThYRp0lNBazKKlKwsBPK9zJ1/OayuptCCUOtFLyDYWpp2k\n",
+ "qNXWH8r0IpnJjxnQFcNmI3LKk+rH0vqX+48vd2BUqTcJ4rwX4e+V6oU1+lJyU8fmS4Kj/iQFUx5A\n",
+ "ntiGKLVWwqfkoYN2YexrEPVBTpKi81wf61aU8NAxYQAAAjdBm1ZJqEFsmUwII//+tSqAFj3B7fR5\n",
+ "G4ADaQx//3+BfZIcqzxSrotcVc8CLm7cBBc8JifUTg3KyGbsl0UtvUGR3t77PRffuzjjVfcKeiAp\n",
+ "EmDpLoqmMXTQU5wmHksjapt36fasfEiGyN1dOKyOI9nT0TFFL0pzQSss7Ux5GajOaQUF29zSIoeo\n",
+ "7hOusjWiFyZylISVuEBU8nCgDYn9P601XpFko2u3FAuYp/svCLJOzc9W7b14FY05eVZdhfmiv0Wm\n",
+ "d+i5ZPIv9mhB+8Cb50V0LQeFfsyfPeAABtfp/HIPaN+amWONE9vQ2YbC1JsqKljPbi6Vrd258gHB\n",
+ "PNyXvESqATfkK1Gnk0AWxo7XFr5y0Ce95pJr1n6gAd91M5RV5lL/XAgE7sYG4524aA+cXAa2XPdd\n",
+ "1BugfbN6YGWbktwAoVIXoUq7TnrmhBrw2FHa1aE9uMJerl9x/Rs847iKP+iuBUD2VIUOVa/G9Po0\n",
+ "ksPo1bHVIsITIKnrhXV1NabDgHAc5kIv+PJk6IroGA19oMw2I1d4rGiaYQZE9dmK1VRARJ9VXDBJ\n",
+ "Vlz3aoQhCyQZvwzvxWhVA1iU1RO1TWnJsppajNeO4Vg4/b+BSviIvrSwwqmjaRr8iuCpVTgz+ZJ6\n",
+ "95zLiSdnoIFqQJA1Hz4YR/KIOmAfhTTnHcdDelso1m8Bx2oHlzAOiYwR4NhSSRD6EhhCU2kXf5vn\n",
+ "vYdShk1Y3/pp+Wd9yZwIwTneJB0AoI0bbmfrtbbWj1oAAAFQQZ90RRUsO/8AVxVRwqizyog1fzvw\n",
+ "w3oFk0s5kH60rPhj0qbUv+9nJnU5H1hbksC+yivmpdt3FAylOp/Re8NoooEKQr4q7MX/kjNCB5zj\n",
+ "aCmG5E3TxVGWGCYMCsdEF1I+HuXX2a3wLCwf1iqCfznNMRG46GE6nIgxc91oY/zfMduLLCzyb8AQ\n",
+ "b20W2eRODsXd4+7XC1RndLreJ7Km543AdL1iUo99hYdoASXjyWRNv6wvJrmyFngIDlQOrLluZf/9\n",
+ "T8Y21pcggXpfTtvdj+B+3lZv29AFHkL2xGPZvyL4UyVUgb3U1DWd/iySeGzlK1IbRNu7obP1czi4\n",
+ "Rchm1nI/pS+cSuamJbhlQHIreF0u2/zcrSGkuOpbObSfAY//5j6RVfcQovw5wL1RQN0tcA1GtFxu\n",
+ "ZpovaLthGUkeOPh8iV5bEpupJR1R79Ew1sEkTDugAAABwQGfk3RDfwB2dNpntdq7wHtHkfExb8Mi\n",
+ "4AOIW+6weDVD4WeLhja/JOA5FtORnuW7CfHWfWrXcPJWyNJJfpx2maEKeggtR3RVEAdA1a1truYO\n",
+ "N3PBvt2C5hri51AyWveiUQtRNh8OhcT8b+NVPo5dLHlfN2wr8ZipKDuUP3k1md+EiPqVCrK5TuMQ\n",
+ "knvfHHEV8fXqrrFiHhWYrAGbSJdOrXgrQTN4JDv0LMwXs1Nl1nmEdfSgT5BF3DohYi4r2xGfiJcJ\n",
+ "KMZ1oPHaRBjgxhu40ZP5HqUG5rQWHD92UCH/Terh0cf4e0554mxHgDF9CBXD2Ey6LaV8LB9Jb9nA\n",
+ "f7tFFMQRIVaLiP+uig+B5OoeaCY5+GdEeHuY+ZE9jNToZ4yOUwNfysZaXJBrtfqEkQosI3EYRZQA\n",
+ "COu9BHjZjXsKjEmWe9Jj9yWusbXq4WMANyEJEPNSeDcqy2nLsc2OqSE4CgyCqy8blbRZqycUiZt/\n",
+ "3NpFflI5dk/7eeQ8Uo727U5FhceNm/3Tv/0N3CZNlPGV4f+3/HHJknpIjibzMw4AkTq3Lkxy1XZ+\n",
+ "FA9yAR3cZ0/eN1EscyudULe5dTvs1EvlYMWBAAABtgGflWpDfwB5Rz+lHWcxYALocP/IVGxKQ/5l\n",
+ "P8Y+UVeKYTw8iTn+GjVV8vbhgCZ5cI/70wvHdrfJYaZZyRIawh8+61+/vwo8HAkEyAQL0QVrU8Db\n",
+ "Z7+ORIRATWUQyS/LIyP8q4/O5rf7OuybqgrrJ5JQm3dvb5EYgnYLHCULt4xtpfvTsT5gEynxu9HL\n",
+ "Km20sO4q1oqcF4MPx2dj7xETa3veUfVJqfvwop/9NWsmPrdhY/wz7rinYt2HcWm7+ulSBZtWIRv3\n",
+ "yMRoNM+lyCvZDr0PaN2HfwYWOYr/NgyLM3qvI6TujkJkGWBIPuiFK/SHsSPx7iAMcrZ3CQvQC1rq\n",
+ "psLEx1Lx0vtWsdQAcjEYe6l7VHqUFbgcjcHAYPQIIgi8NauIxLhxUOQnkJo1mXO/e5w2N9AAHA22\n",
+ "RlXXsFU92TGe3GmYdLlI4OC3IklyabPhxs95veQzY6n0a2BnyANXxWrQG1vVVVAYgtb88NEdo6By\n",
+ "gCh1aEE1VpUTP0of4shaZpNk/2gd6T34r4uIClLqdADAAdaA4/epPc357p2Ro8OkrT9okATGaQDM\n",
+ "AYBiPC2kAQBkyn5ImAAAAdBBm5pJqEFsmUwII//+tSqAF4In0o7iUdIU6DQAMu59v/f4eNbK2my3\n",
+ "LFfU4bVvmOXvurgANJp+yhdNshfKZWyf1yiq02eNo25TtXkBg+c9UZquU5KtxkSr2wTyRJb5fWbg\n",
+ "+NL8Fosje7XYkSxYEiB3sVwPhHSvNWh2d4v6fN1lP9qvuUnfb1Bn+TdruqmJdM2vx9efbO5Th2CP\n",
+ "KiH3jeuRzoCzSIUG7cY38FVzT4nUIJdz+2KjjjJ0E7ZNKQ6lROaPqjFN4utrXaZfqGFX2nWmlL+h\n",
+ "PxS7plcEcSC1oWpbRWphWgodqD5c2VmFV0yO9NkxWYeDoEeaPVORAB/gqWAbIHdoZVHMBBV6fLyv\n",
+ "D3u5FppjGB4tzB+WC5jnXJKg0Sk3SkInESay6cwWUVJt/G4Tfg6wbMdEkCvCKlRosg/RTpp5P6wR\n",
+ "Z2iZfctuN2EQi36vtriULh4PVI/bw9ZXWlyhMpAYPlW3C1NvZrlJMNaSqGSSnh5cJMfrxHquXcAN\n",
+ "CTgojRhZ3tMe14Ny/HV3UfnpEJgrqxN8KZxlRpYS28Q96uqEu6NBBsBIIz0ei/Mg1x57c0aguL4j\n",
+ "dVBDXATm12Zi0uXfiRBRiIror0O2CDrlUQAAAPNBn7hFFSw7/wBgSQL3wIE2Tv5B6OJXPcoXMcSb\n",
+ "cE8qv/1v/uy5HaAJNUQCTSWlcVovOwe/GLZOdN2BNEgb1OlzNEinzyASzg3GuZ9zFeyJHe/zvxXW\n",
+ "qHgQlhmuH8QdE1M1s5tXy5mwAyoAiCrzupaN60ez6jWL/yRvGdGiPt3qJJLeMG60zAMKa7QhUJFJ\n",
+ "FMWUFrcLW6iQXx7VTZR7Qo0gz/aCe+BxT2h34J4bdpQTH59SHjOd2X4DMr2kpW5buE3EQBEKSUD8\n",
+ "yEiNy7MVRtsZHXt1V4Pb6TljTGXtC9pzGwEXtgadiRP8dhtDjxgpVN3IyoEAAAFOAZ/XdEN/AHkx\n",
+ "u7J3fsEfo6cXtbkNOd4swcOB3voAJyKHu0c0/MGiiYXv+2wca3XUwSOEG+s8df2rHPxj/J/Armyt\n",
+ "j86AAAWOWZsl8AgjGF9fWv1mQf9jrWNuA4APvfeLBFbZJZm7otp6Fc0DFqB0XCbEvLTkRU5ySc7e\n",
+ "Y4CD3ziWyxgWkLgxNxAV0V3rzOqUGhFxcTbBCJI75knYyulzgB9+SazwgLVSR2N8nND844Y7GLCN\n",
+ "0aeRWZgNIAWJkPPhP1VnSRo1jOpV+axgAXL8ExpNwIvLk+O8lekZ0/1o7sI+uJ46XyI2SuA6uJHd\n",
+ "bwUKNMI2qDKAM6f4kKlJLSQWqzXAi8hAQzI017i25Vpi5npQJ4TsJeyOHRvmO1wY5ZnIEZHyhgB4\n",
+ "IoLWrdA5opbAou9XxH6m1F6osqepeJLd97Dr7+5BqWzoHoOLhOxNwAAAAQ4Bn9lqQ38Ah1fDGltb\n",
+ "SoFNBABy4LNe514R+dnaDTYn5E46OmsRrJgYyAm1lSXdflAXI1+CFQXE0A4eKb0poyZSLaaXfRBJ\n",
+ "r/tA3jW8xYt/UxFDszVrqnPHP/Ny6pw3mJ+pwWr+YYAHxNaLyZj85nxRNPFMUkOr96iCB+MslYrg\n",
+ "cr/vUoZCrrFka9nw08yFJlyN4Ky9KHUYJOXDrBIiz8KQQaHFalCe3rENKk9raHLB9E2PdI37xydW\n",
+ "9R3Ktqa3KW5rMJCOoArO2/3trkkCh+/FDlbsei4VdbDQ32DjCaAkDFjCyuqOJNsi8nSI2KDSRFCB\n",
+ "83l81kCObhPemVMTlMBQzSDvOtDFUtuVwHtirD8AAAFqQZvcSahBbJlMFEwR//61KoAWweTusUEY\n",
+ "AFR7WLigAceU/KgvW9LBBRTRioW652v1Xpv5tYMFhkRmmlUca4/8lM9NJwOZFgbdLq3dhRjr1SQ+\n",
+ "iitgTnIKVe77qt/yWy3INzcVxffYfGucVy2ypyvLSUZVvVzu37Ufe4d1uKQAC1EE3Wwzkx7sEK4N\n",
+ "QwJyCdTZZnLiyrlEXcLAMbB36CvMtmCiaP8XPpa1U2RaJxnBB9qYeP0+JCORflaC8m/hyWfMppd0\n",
+ "XeCFuAYTEakC9vO4HVF02QH4GZZigg7j7bXnvstEtP5QgYZViZcOoAaQGKtWm3PCHoS8mKWfCUk8\n",
+ "ZLC6z2a10V0U2DavVH2m02W1Lc4/2WzrwUTHr66DOaP+urnPdabeHdXruv1HJ087InGSipJtxGko\n",
+ "4rppNbdlP4z6g2o/ksCKcSZ76uS1diKM/39wzVYDu1tkCD1lomve9NoQwUToKqCn30PDqMAAAAEr\n",
+ "AZ/7akN/AIdka2XuDkeawxOj/BZhZtP+kNbRABb4RmWT8vSOMSH2HVKuz5/n3pn38gQM6YQqY5bV\n",
+ "v8KsLMWKt//3BpX7BUiSjA/GsXEpiGachc2o+KqjjRfujy3SLc+TvzNfgePwT9w0Jj9Y8j6ORxA7\n",
+ "13x9/iM5Lx1s2OQQyRluiOYKxXDE9QjNulPCcMLJFKpvAfnZmzl0pzzHw/ANcBEDhABHQ9ftCkUs\n",
+ "Q4pQOQF20mJ1++bXoRcUz/lR79ACwohpzpGuaQCknCVhUL3lnnyQzloB0PAIRq1VnOd+y8D18t8/\n",
+ "IEva3L9FTrRi90eT/2pNxjMaqrOmFzrhjd2kmSd3YBlll+A3KrjDn/HtXx8SDjztM7Km7BEd2LVO\n",
+ "U1pVGn0+C8gCov9gxoEAAAIMQZvgSeEKUmUwII///rUqgBet471BV4xl2QAFRvb+6Uilj9hVaCt9\n",
+ "oXOXB19FM5G4bNDJAOl9w7HrxMOF2dPOUf977Rp9NoBObCR9cN42Ht77Y+l36qfp5SrWPFz3DG9k\n",
+ "Uks1s5yfRvMME5RxPYk9+qohbe5TR7z2WNWBJjaTvhnu4485WU3BaTyIbA4BRRdj0/JwsbCXRVZy\n",
+ "OMmFdXnFdxhNGZ5JMCQy+ip435WTv8KevLzG3OUTxX5d8x0gaiQZdaPwNC9GVrgmtqTc0z7He5Hx\n",
+ "p/UnXiE+WgHU095CwXga4AbeOtQbj0tjxKUoS9sAoJ5fyTlHv9FnU0ujgUuoA3Kj0ma5qF69zgnv\n",
+ "MTXEIqf8zuYuInk435YB6s5Aa1W77q49/ZLR70JdKU9F42nWnuaGIFvaX8JNp0NTGvA0s1VSOWIl\n",
+ "YVdpY6hSPbDqLYXO/LE7X1D3sWpexh+/kcA2B6pYDzx14bD7OD1f9pMDWxIrW6BpNH75M54gOMY1\n",
+ "SxoTsfh6KVoyFK4Yqd6lPKCLY4O17tm0vzqLEva8zNeuM7b2yHKwMHpqK8FV5yaEer9Zd+uSgIqd\n",
+ "eftECExc0GDPrda1mDLPyRR8iDjZRvRS/EElnceTaWiUEonB934ThxItQqnJINdKSyNdNwx44Jgq\n",
+ "H9/Zh55FLA3sdVDr+1aesKMfNmYnbwaje7GN0y0AAAENQZ4eRTRMO/8AYEUc98FD5/CYkGD6VZTK\n",
+ "7qaMD8JeD5Yvz1s+LaCSFWcn3aLtkXWLu76WBTjEp2boTz2lISGgYIiIhTqGBdSAvn4GaApcqQ2+\n",
+ "sy0LjwIg9aZXDdjP9AWFTV1H8wY3dWCf+Rn8X8p7dsAFRxXZ4015PG0t6STtIq5DOqARSPJ32oCq\n",
+ "OenP2L2rQhT0bU7kBXZqDOvuedMFko4K8dbR3EOKtstAjt1gHGNubjQIVeNhJsdrdMtXEY7juX3P\n",
+ "NuPteAILXrR8S3R5mIOtuZ+vWEUdS+Inr7FnZsbQiIv9i7KDzU2m3LJLNdjmArFBBLgFXYHDvQmL\n",
+ "9VT51Mb8gx1TyNar/CPWDggAAADyAZ49dEN/AInJdfYNr4ilmYSAMFB4GADpypoeWWXE3q20mGL8\n",
+ "wfGmH6ZgcbtTXJWZn5/uB2IPeQFG/rqNYZ/bmIUcKhccFRuPa9wOgu4Qnm9oi81y+ChWQK1KoKDK\n",
+ "TWWDeg/SDhV8w/q9dFY0rcekgnjPKbKFgzK+IO7hoMF7vhpMoVCqvwMtBaesBfF4bzxIufyftMba\n",
+ "VRaJWuZpM22/FtH8FxujQ6EjGNr9PHZg3rsxXbkYHRqZvH6RGypNdfKRL4serPMKtCeuCWEKaj1Z\n",
+ "h+pr+ULdNvwpLLHfA3OCu3Ql8v/sLDD/O1LVB9ug+l/wHpAAAAGVAZ4/akN/AInJdjcgUcZACEqh\n",
+ "GvWiTtr19IbQdv8WE1dBOa+lNipi00vM+C9W8F7IDH0aaS+KKFaekfOwUNG520lVemVKNYbjnPl7\n",
+ "LimE+s4N2NJ5SYT5+XRMb+vTvKCkG/By5wQO/WbZo9HorEm10+Tu4CVIj+2Ky5hDZl+kA6mkBK7E\n",
+ "3LwAW+4rGYiO9JH1BLFQj0ZOJq0ybrdVynOYOw8TudsCI+I3fiT5nmYCkIO1N7h++s67fASBLfgP\n",
+ "CYo7yLNwfifRM3ay+JhoRmwX5tGJ8l9w676Zo1wDaqZ0Q5guAYSxSJk2jHShR6LxlZmIVJnq7S00\n",
+ "iBOM0mxomzMhjpxeX6zqy/aA2SEREi4ulxZsEvlIWhLQ5YFv6LMkVEh9RITRQOsKGEls7Y4eSRWc\n",
+ "f23FGWOVxL2MZUmPGVh++Xygx19XCiXwoatt/s2T7zGfLkQ2IBiMKXoeDb7yiR4q+0v6UjACWT2H\n",
+ "kOIRMpG/B4KQPsfMRT0Rk3cAwV9dNnKm4XTlo9P9TmyT71B/Greq+KvhEBDxAAACJkGaJEmoQWiZ\n",
+ "TAgj//61KoAW5ktFwTkgtAAhBassVgP2a7WSOTniW7GlpUC5YARIimzpboyDKn/53KIxVBS+A0NS\n",
+ "3NuuWMzq53zfHvhoSdYO4dYooBUDN2VkLpVK3v3kQo1FoE02X3cyV2j6ziOTJORgWGzqU5k0XKJO\n",
+ "1VCPDS1gJclQYem5NlGAENmSiR9I8XvNQLGvpLGF/2+aU31xCZzIPp4tUxyLu/gVqq+6L5DezfDz\n",
+ "gPP3+vv4JFttE5Nyc7LysmCaQfUhi6zPymHmdLjs3bZdma4hV61UMMsGBNZfYf2GUkV1dVZ9kkfz\n",
+ "RyUYJPFdwjA5S++T8sc03o81MYXnXYkO9hGiG6RRLRRV2fPSgGhghnaqxRhYVQiuVS0ENIpjxqqc\n",
+ "KBEaAMs1VoaLKEOrNhZ8yB1VLLV9KSiM7/prkkNKRuNLp0WeTv2eHtXhIdAfhKb+ic7Pb48CqpOl\n",
+ "FnnbgphlxDaS1dplrA4VxMNzEL/27xNMQzhuRvnSDNb60j/kSJHw5x2JG6G/VwCoVAfFrZll45AB\n",
+ "Puajv4y9+7flMd/pR8Rg9UAn+cey+vNCcCbbn7FNSWq2hl9cymk4fwW6iqBgiFEQ7YZtyDoNCyYz\n",
+ "KAnW0gvHCg+5n6+qxC+xDS291Y4JfSW927ZZudU0tXxvupwcKf6fDXxz/bqsOMvxj6Y81+e6Dezh\n",
+ "B2/8nCpk1Qc7N5s0JoStEQ8+K2ir0vIXayhFQIgAAAEeQZ5CRREsO/8AZTZTJbuKD3PiQhYpzA/Q\n",
+ "3Iqsld8XUz3sHppFsAHZevvXPBLN2cIUd+YCbEEH6MplVFEcbuDDV0dnlBcrCNrbp3+CAOdBsr6h\n",
+ "0YfLGDPxHlFlUCi4qTS1o0TT2Jzkq8/O+TU7SSImG1EjEmOGpKvxjn7KxERq2Pbd/0y1sNHk5hiQ\n",
+ "eJwHwc7Z19aIrWes4h3UYQqHeU6kfCpUHVgnGubU2A0Xjg0UrouNSumFogz0StLk4fuhL5slF3Bb\n",
+ "3NpP7YhgiVLV0FNM21/pfbXvRQFzmliOaZuScgePqa02nvOdEHEpGVRPLCGL/tvzSkZqhXResmQg\n",
+ "1qZ/TxlvqjWYqPRThBIk2nP66jbd6NLagdWz1BtbrwB3TQAAAVkBnmF0Q38Ajz7dDL7wKLyRAA5r\n",
+ "u/5Co2KbB/AnQg3XvWeaImUuto8KuobiZ5Rpi0jf/+r5lFprj/mYxpQ5OwqjQqFG0eXwqi1D6M23\n",
+ "HLH/3LvgYXkbAAGr9uWkQaEU+TeJ38WNXodDC29t8Y0uYEpwNzyC6FqtgkCyDYDpd/nESpdVRRJh\n",
+ "15SV0TP88AKwZsT7yWH2r5gpJv8AhXnnWmKJ/WMwiS/2+Kf3ikj614P+BDohXhMYGO4GSZ19EkRI\n",
+ "RjwO1zoy3Umd4iOMuBBPzevAs74sU7IUdkUF24rNAstoyqnAUgY510L3SgPXbZmJYMv+tRpT7ZuM\n",
+ "oLxE5ACIQ+eHStmGZgh2P1nvrIaZRiBxoWZ1B+DDOtu5OZpc7LbajGP/oy8HbEFyJIcGXHGB5VXY\n",
+ "HnskMmabuu5xyFIJcVaqbGg3TlqrbBE29OX6xO7K38oavU/okVlIM+AAAAGEAZ5jakN/AIdXv9ZL\n",
+ "/wCpeCQF0zyG8897iu+TVNq8xXl3pE8eXm424VBKoADmOQ/RgBgC6Y0IzpqUKPVKwCZafdEIuhUv\n",
+ "zhgtxewRpr3F4VdMy9NUqqvPfGroLPxDW64Af18RtCEv8t7amX9ezvEWK8AgZjHjHXeVi2k8dp4r\n",
+ "TuMjdngEOGe6y0V0qXE0vJudyGSblaiStnW6rV0e34JxbdN3Qbajy6ozlLfOkq7Wqx1iLXxa4foY\n",
+ "IPBIjzxdye8gOjZW7bP0axd+wppVHkXrrvuxUf9dp18AanJIIFv6MCm6ujRO2wyu4ZfSbZp/KVFm\n",
+ "xvxpBAJyjKSdCoPxWylEDyms9NAmwAADmUiy6WUOIsiAC130X9MRKfeLHi3miJh/YDGeINuX+P+e\n",
+ "NWBXxp3RqAzo1eISPcPztmgXUHCSN2VRpnCOFQoF4yyryK4v7s2U4a7V5e2sVJBhb7kguiVFACK3\n",
+ "rbLSCnWI4OCs6u017nghnGW3Juq0rF80iqmo5QCt19S62wAAAkZBmmhJqEFsmUwII//+tSqAFu/w\n",
+ "HjJpMYeKfGxaFh4NwH9VzFzipiNnWLhZf3lim8qQP0NcWviT9hCfSjxxrnYEE59yPQn7u6+tCr/u\n",
+ "vn8/iyWB73TxWIDTyqwOWzo0R8Wj7McP4QWP8yE0svd//Wkug5+3cHmcpP/ONbeBn+TAQ0VzErlc\n",
+ "2hXFLnmGW7EB004qvGi/S7JfG21T+V5Sx9Nre0PuomioWltV0uJSYiMg18UwZktQhoyeO+qpPgky\n",
+ "U9/xX6NUrUyAfCz03v4wSV58lpzV7BxftApX8ZGWBx2zWQV/YeOCEWbmbHqvN18Jd5FxK1iHRqe+\n",
+ "nBGg6SyBQEQQfCMxCo37AXM212ulRN9X2fE3P9HkhvkaOxQZ5AElyFJ4BlaM9J8bcUgOX6NS6Cqb\n",
+ "n7IHMcCIPjAIJ36atWVr0EheDYyrwatT/sRxqfSoF0RgoVqtGqstMXZF7XACu2N9LDV5Ss0B+mSl\n",
+ "kJJqGxc50wazbtpofP341QOLrRCoQigLO2IFkJyqTpln4FgoWIMbx8x6cKkFmIESXv7mZEx6LOrL\n",
+ "ggZa/EdzllkBPCO/+zBjmey1Y55MrbMpoidNDpdQ6yZ4UDU0ai3HtghNjtrUaVDC+dCrSCASLB02\n",
+ "bO819PX27qwUTWW1MCrVhUzQkUkht4Xa4bdnUW7zTudPa++EPxUMVY36vPDJoCGilCgIXzTOV6S9\n",
+ "OVTh4+OA6S/XkcoA6ZjbQLERX5kZSQMoFJs4bPot93titzpDSKAhc1QMx6eKK6Ol2IEAAAEkQZ6G\n",
+ "RRUsO/8AZUEFdKFRxHYcrgnLV1IJewAc5dAL6/Pr5YWcZb4ejev9b/lpY1ea5Xk1AlTe44c3rPkF\n",
+ "DXI6yAdEC7kxPh5StAse03AARSF2nro+Dr5bfPJyYF/ERJ9NScPmUIVihvTCsyh5qmuoAH9P7eCu\n",
+ "Y8rdH1hF/pTSa+Z1tzZc8gwGtgV/YsMtlWLs3VbLWxt2KTDW5Y2b0HA6zgNn25rXu72r6iiN5aw7\n",
+ "sjFipq/8rjgHE9K0EK2Opn+0SPK2Rbo28aoNdC9V8VxW1CpMNxKjFOs8YmQmJE6Qtkw+Uo5mh3ic\n",
+ "7Ng6Xje5wAF7a8Iyr8DMIwvMZnnVp6ilQ1B/LSGEPncviRIHH8w83Grtt0CsL1L2isuyMboY11N9\n",
+ "lxQPpwAAAUABnqV0Q38Aiz6zZgMl5b2XXQAXQ9yHCqNv7FVD9CxHdTnw5pqRTLAoFiba5ss3lqXG\n",
+ "QCf4/o32jzmzNKjZDN2ghdo3OS7n/NFKTMs4yX0NTqaEhdnVRvrbcGvcKo0NYMgzE8UNwneueU22\n",
+ "1vpuKbOkae4P82iS9XSi8TlOPcF8mmD+n9qfVTXzL4r0M/s5xxZempvnxqhz38EgmSM/Zw7kEyiv\n",
+ "giyuP/YjNhFl3FVcOSLiQTCj+F0nLUE7lia+UkuO/YNBXwUKZKD8Add8BG6ZTC4bD/RSktc7uv8w\n",
+ "NB82AXgnpuELTB2xZFOLAYJncjo03/3uAK678Cl8cw8fzlbnSpp5eUkHacCUtAY9LPrz/OMf2bA9\n",
+ "vBE2eUwrxz/W0Sg0tjzkUrpnJSF+xYsA2fgRolT6A0NA++mVN8PJVhaGzQAAAX4BnqdqQ38Aj1eg\n",
+ "HO2BrhbSJp3bjAA7Lyx/X3Tt1hQ2T/wP93u+Km2fQtCsS47kHT/v6cxSu0EEWzwOVr17m7uMIt8s\n",
+ "rOS2NL0s+wNbNsQiUhFGWcubxLdtukca9QFTdaQjRXuW15l7gz2QnuVPe/r9SLMinrQ8TAT7c4JB\n",
+ "GrUpwbYY2wvPKUw4NOIKdjGz2TGxM02Yhqm+YQD7nu+MPeXg/5dBf+XeKfPK+RchTbfnRfx28pUm\n",
+ "+MUq+ynmpWVmmfO3TbD8gZCbZRUeK4LOH5lP3nvVvkbZlQVhN5vPlxxNouZsDfsmprxmWrHzH3vb\n",
+ "E+c7VsDA88L9wCH+ZmQGzxFjyOQ8cz4P9rsZSuU8vQS1h6fmk4XXUosrmweEGKJT/Sv5qb0OG8e9\n",
+ "voRxFaPrroiqkALWSnA5n4zcQMwfY/xXX1aR5rslt9ItB406qJIsbsrkl8pXUe2CwOVm9B72bhd1\n",
+ "lqsCRNktqyPMF/Ek4JsxscPvDjbSqbQZL+uT8zjgAAAB5EGarEmoQWyZTAgj//61KoAZQB+OVG5p\n",
+ "SZHABUb2//v8PGtlbWZ+A0oGGFPTAdgmU2TFbsuJ6mwUCouNe8f1I2ythN04JSJ5lx+ik6KpnC91\n",
+ "1FD3eD5Jit+kJIg5holbnldcijL50GRMV+Tt0L65TPBxqSAUdrQu+eLUTHPpJCL4CV5RJau8pEIv\n",
+ "uK3a7QA/UMQ/nrDjeZ6jqf1BF3JjbyaeIc5drvnYbR6lQ0gBIzp/QRU9xrHm8FESnIe42aooWDJ9\n",
+ "bVMccs59QBQd45WisW0MXV7NFtyepgfK7biPJN57MDsWL2A4LYHAXH6f6In3GVsSrYQ2HUKGlxpv\n",
+ "Yf/Xvk0pBnHsuIEsslXTjxwTTzuRb2YT7QCJp6yHiUVL67n8RfvHMNoHfUzP4rVgPSXcPL8FOP2d\n",
+ "F8GxovHNOmsOSUyc+t9OZXQFF+4FJNSN23FsgARohBEJ3c1u0ax3ACLYlwfCd3/U1mT29ftZkWMR\n",
+ "uj01t9v2AGHvgKM29X2Vs/ALzLNDd2OM9z+AC4TlcpgcRujIhnjHf17Je/8RMBqJCZtdfrFmz6AW\n",
+ "Z/aNIv/p/WX6adpvStFWxoDAnf+Tai9COS20TO4GHDviQkpMo6tbNTk4tiYWsmvBNq5u/aO08r2y\n",
+ "Bs1eH2kAAAD6QZ7KRRUsO/8AZUj9pUTz7rNMoHjJ4gSsLw2wABNFEVCVBZ8at73oa3C8UmeDMVba\n",
+ "M3uHP8p2EFDXTkl9EiChbxZZgpuvefKfc50lYhoTJ/7H62X0Z9NX2I7S32WT1XJeJtD32zfVBu3K\n",
+ "VmE+30x6+W2pKnyMM0ZejDKLq8WyIyi+9rC0QVVyU0N739nDCyt6aqRfMfSdljqTnwOmgDB5pHyK\n",
+ "U8Nf/BZxnIET5uBVX/VcS4bjmT9sCYYwmAz5vBy8cv5J53FYPh0/wF7kP2myhm8SfTnmNtpTej0y\n",
+ "JjLbrdGSBUAu+lwbCsr/YdOCYrxvvrklZP4j4s5VlQAAAgYBnul0Q38Aiz6zZf6skuDOogA4jl3V\n",
+ "YKO0NncAuqtob34dJ/eVmQtCFk2jxP+6gBUwoAJ5d6wKEpypNd+AlIf83kNIAAC8trXyGAv3zzzV\n",
+ "tAa7kzCHOXS39Rxic+qZEHcHH0Hx0iIZnH1UNeoS6dQYQqolDkQpOXG8nP6tDCpAEYSQsJzo5kch\n",
+ "Xf9jICMUCBjMQXeVS1i3FdA07mrKCBowVzEdee9WvqvXV7KuMTufiL0hA8BHvtD6VFvEZ6eiqgvN\n",
+ "8RNM5cYXQ2i+4Lx4R2QlAIN1NNxqM8GvSjSh/rgipqY8DwHJh8p9Jbu0Zs+w86pgxJN8m/cvWxRZ\n",
+ "yFAtI7sBhDbJnNXx83ll0o93YVJhxi0TxWXPf6PlHZeEyvr6QOF2VVafQjsZUg34P/p6tj3lkAer\n",
+ "aZouLIrbfbTrpoGdtXuXR2qC418s780GZsUBVTlvppC7dgGYqQzB5daoV61BoiIg6tQyG20Yk/Ib\n",
+ "TtwSJmeU5Eiu/zRo0bpbU2jgV79WVCB/SVzxsmoD1jJEhzN1FHxsbajOijl9Vp76GofsezNr+37n\n",
+ "UWWhPPzCk1rCLQgaI34ekcMUWq/vBK2WDe7wKACe/5M5UglN5Ct9Orsd3SfYPc0336usW56marFA\n",
+ "xW2XgVLc1GludnoFyQrT+oASHSl68jJc1j3I4WTIeU/p+eW8RtUF4AAAAR4BnutqQ38Ai1egJmdK\n",
+ "YqnGBlYUAF9obzNVJ+s4Wyt0Rq0YuZmzKSClvCu/741bUzMW9+2RqBxHf8xROd9WCD2DFO6m3iiG\n",
+ "ZOgLMC6WQsGlrWDKBATBQkW8M70y/ztO1ZzNQj1ow5FREW75+T8qWeYnaEkP0sDPfhS/8A++EHpT\n",
+ "ONUZpoNHugOpCj8EFvE/MnQhkWbqDB+V4zYJeD+V1h9PGTTPeM5Ykyq4ZMi+8E5Gka9dd2CFXMaQ\n",
+ "M99mRo+FOH0+y87A4U4JusoMgrnGwBHn7tNdR1Jgk+wKYqmIwBj2jGPnQFJXhHhE3ZkpIjaeakM2\n",
+ "8MH5c8xC359KRjK1nfiZHGSkxS98YPps7lGGiAJ2WdM/l0XaVpItX1VPHy/wAAACGUGa8EmoQWyZ\n",
+ "TAgj//61KoAWNzc2A41R+LAApun++OIZUz7EikV/szjfxvYPLx+f9K2/F/he8DHawkBMdV2wRLxA\n",
+ "t50GIuRUSWE/39Xo4nAQqkjDTJdufKMgNIx0erMAcY2QA5ejjVo1tlzncJOxCqGpuGwA+5/4IKyu\n",
+ "bmTzdPecTw0ZdpVPq5j/sb/uUTmyS5oriK2QJUn4uMhurpWU0pM90BFHxmx/55iJQnC/E4AiRjGv\n",
+ "TSfvy9eol7L6q3/AmWDGKQmta5h6TQecJSS7keMMTmFMkcgh+dQEUTFbphGIZpTz6vxfkWPPyqpQ\n",
+ "VmS0gectGBeLssajkGiu1ivhXeMUvGnpqjpc6XSD8FJ8sVdfwdsse9JozsVq/t5YFq5+AnEYcopl\n",
+ "mlIiLVwif6/glDa/FvPVZyUrYuYY9L3TA7eEHe1IcHWSOPxpnafEFBrVGoeZPrbfymiVcHOQ/3CX\n",
+ "aGrpVwdWrmOHr8jLuajUxWOW37ajHobcyT1hYWMxRTx80fZmsfvsrNw/Nztdx7LidHGE8jPZ4gQZ\n",
+ "DABlByR/bof6mTmjqkfbsR1PCXy4RDNnn9nCnaSnb8pCApsF6YsDTv0+UmVzx2ZPSdm2LhZIqOim\n",
+ "mhiXHWt+ZE1dnYkLwTdsgNYEeAUTjY5XG25CAykSMfKGwGWeeOwqKmLAqTmb7mCXXxxpy4+bbELo\n",
+ "RAxOLFOR7z+Rlt4VIVMH4QAAASRBnw5FFSw7/wBiyP2mEJvZyVx6ACpM7CM8ZBKHKR5j7ndOem+L\n",
+ "X5lQTliSlHrc19blDxI+BarmPxVVRFr/CorqLGvI+vHNUfF9L5rOth1seL+LchCRD6bYXJMlctoQ\n",
+ "KBnrSfN8OsFA3rCX0rxhgXIKgdEDuCNRYd4XCiw0AyO8VPwgQ3UKQOwN4T9AdwOVZht3xWSjlGSY\n",
+ "LTfR+DOcni9vpFUI/V99yTFNeriW/Ezi0Mmb4Xp+UrrTAn+/oqePQryHATZ97i1I4TzdZJ6ol421\n",
+ "ZZiGDIa6I2z+mz36WJISXYfn5PcaqZon5evy7wkHdXdLSXQuyy6RoW3UMK1kv4eYGMx6MEUBV881\n",
+ "1DxJ4Az2tfQhJ60iq3lK6xGARpoGTWiGA3pBAAABAwGfLXRDfwCHPtdry+v+2nyY2Sk+gF5YW5HN\n",
+ "XoAL6QRR4alJgXnPRJGLu1H/XzBsCOVwj2OHZ7/Befz18ioG7PdTUWTo/DFmzXwFwKSHq5MESJ/K\n",
+ "+czoaBaMU0SilMUvvgF9NaNkzEcYOJjCpUUkl+lvc9iWY7aNcNT0YkO2YuPLl1ZJa6XpXyzgvJfC\n",
+ "YABMMMlHP4hWdgac8C4JyYJle4OEiXwhanMhhDIkpZpmZqqPP6iXGzuSTb+0ZDMJHqoDGqJmkb8S\n",
+ "IJuvyZGNE4panvJTPVd9f7g4/aXxMPm3Cn3wfT3mTthI056NzanOEWKjM1qGy4olpTOi0cV3zUKu\n",
+ "VGl1k7sAAAHXAZ8vakN/AInJcXImIY9AsY+/nZAB2XUf7nMR8KlDfCSlxubwbY5yyAvaK6FdhjtI\n",
+ "iTEMX/gD5nqi6yBjPV+WgerMVdQiwmsTWCh4ZDRMTEvRNiTK06p6H4BM93iWfwAaKh8Gz9Gaukwy\n",
+ "InHLEZ0yD1XqM2twrrM9K/zMIWUOeN0Z6Qpdges4mCaPjYBUMA0KTxEuHmES85gUYlt0s0Ks9Nu+\n",
+ "2hfyb2t0rmyvRs70WgBBgYrdeTZMCwmoCbRHPK4oxsSlCang/p1gu/DmbjnwYRln/v7ufz7R3gdP\n",
+ "Fr7XrHKEZc+f98DBxQMF82PBbmDGtLAQXHwptz6g5mqHfaJhvvgj78jkqTGrQ4WXMBaKzHGNvGYe\n",
+ "XIR0bHtcMMQd0uz0UHs+NS8bhlZ93PGBn0DI4S7X4qFOiND2PCIg5ogjbfFqU4Kuh5oLH4L3vi2E\n",
+ "bzWP7DaofhwjMqjCqAvZAgznNJDsvnJzQxJ6Pqjj2ny04t1drdQRUisSLN+PcLenLQZbe401Xg2H\n",
+ "yhW845ouHrITGSqb9EOEeoN97gj42PjsdYRMVLRDVvCV2BOAqdLbEmICPHZnyy75qPsejK7duPuc\n",
+ "fJ9rEnjynB/HxYz7zf/RM6xyYbzIoc3AAAACEkGbNEmoQWyZTAgj//61KoAbj1lLPyvb6PAZgAh9\n",
+ "7f/9/gX2SHKs8Uq31kdycpXc3bf6XPCYn1E4Nyshm7SbxYTXwR3t77AgzFtBuE6fBgZeY48yXmAW\n",
+ "rqOr3iMlgArjVOjemrjz47grY/T9rKmhvhaqPi8pvZTzkzZCl+tV6nzXVbBFw15yZW9xk2z611V7\n",
+ "GITjv5GH4Oi/06B5IbjEMVKEcRpvt893HwIyUBXniM9I90uh0TBxOedvsxxE2iLZsr/m/GNXryb+\n",
+ "9as6btju6GU5FfXHAHKy97PxI2Rac5Rx/FoPiuKEecRx7EQrDfRmlggPPP63oMY4jkBeTzC7Drwp\n",
+ "8ik2Z4rhoAMWlcRPfXCI56oe4Jt09oRInuaD3ww9/jGDjhHIXGbNYM/s5UG1XuYLCqaLxESIyPG/\n",
+ "eNnETthXX/QZDvDCFX3YINANkqDvHlUQ+vcUvksaWF/g1aVcMu45c8BoP1coWBAVWVE6iyDMwfYl\n",
+ "RYTcnNfp26mpOfqiSJnYH+AFj0qGJttgeZBuJCzdV4F5EDreo0WWAiq/0jdXljJ+ZxDij/UazQOM\n",
+ "0ct15Q7rTOqLKy+lpOVa/koSWj06e8eyy0wY1FBSVaROGYbDgXze1QzYiVyP6+WTk1fjz+Do+J+/\n",
+ "TxVlHJsfUOz0tbPJ3R4cSjRVigTxPg9VAYynpzzMlIr0/pCOGd4XYyl3SGTwAAABOUGfUkUVLDv/\n",
+ "AGU2ltMhgssRVFnYDYHdfwUIOpARUIP1pWfDHpU2pf97OTOpyP7SrW+j72yMHgCy10/KQJvVenOE\n",
+ "eMrSHUfyq6lVIsdEDgl0M+/NXx5VMpg+IZB+I7xozsY2f0ARjiAjA8ZSqG32YEqaGwpGp+vfKL3P\n",
+ "hav1CfnyaUmopPCa0Y5ww/PZN4YINPOwE+Gg36kaKP/ME/B0d8v00CzvLXmI8pIa3TqrGIa7PF4X\n",
+ "8miGO6oXkRH45ag0gFdgkGj+BD1PvtIptIkuqTa5jzG/NewDN9cCfws/hjc474K6NoCTyr++7Tth\n",
+ "LSIM60DcVje0csuhEMwOmCNob99l/AJp/9hMVsVsEaxUNsWBZFMKnZoLJU/ljkNlTtF1zcUwJoZD\n",
+ "oLTT6FmWVzlFnyfjiJdVIqMAAYsAAAIPAZ9xdEN/AI8+s1VkrBucudR5tN1L4cUDsugAOgW+6weD\n",
+ "VD4WeLhja/JOA5FtORnuW7CfHWfWrXcPJlwit0rQdaNL8wYmpMOBxVMKErdopYTnWfb0EZST9ZFP\n",
+ "kGeAI5wBNyE7pmk7U/hz6/Uncd5yONsvInzdtLdlFGIUuwPsZsiC4nxcPKJ4ER73zqMcPC62dMwB\n",
+ "YeP2JTSzcWxmsY8AuUeSUMff3wugzCWo2dZWIqj8MEevc9dnI6e4RX4rfqOmeKfJ7QFxuPllAOzz\n",
+ "FkyERujhdmr2mdRExctZgI01tg+iF/NwBCqP+hQ0BZaq12BgDPwBcWyuj8PXGo/75aroqbic3atK\n",
+ "78lcQoP6TccBH3q4TpJbdFKZCXZFrS7Hh71ZQxzuADlZ8DDRzGHyvFJs8+7LX0Z3SVEeli/7hzNR\n",
+ "3en2BovQV52x/rwTox00ojUHS89/I6QK5rr9xZ5z1Evdog7ewBETCofR8FQPxE+2X576ofb9SYpa\n",
+ "RU+FFWJ4WPQBj/u1ljXdmoINHOgs90YcpGG37DHSgRaxKh3h9samVWdsr/7ZPH7Krx9nfE8zJoXc\n",
+ "5Frf0sUOO22BhUTf6MatKarbA54SuNAmIi3ejRZKQJ4XCjhpsLBrmw33yy9Nk6OT0LCi0ELysL29\n",
+ "OvbOK/J+/iRz4bP6v+/3ppYXG9MzSEeggmS96wm6yOsevJy9wrAAAAHWAZ9zakN/AIdXwVSZADwX\n",
+ "ZeAC6HD/yFRsSkP+ZT/GPlFXimE8PIk5/ho1VfL2NNL2pqViOd6YYnwc7ksNMs5IkNYQ+fdC2XMm\n",
+ "GpZcBQdS+anJcAkZpOHFxqdIo1pLhI3h3bcsWXXBd+BTXZhbA2JSmhm8EWBGqSBNaO0U3Qcdcea5\n",
+ "428f3xthr08dSK0oFN+HNErgBuKfL3JZNShDHaW66u0MaG1B/cF2Go8z1F6LGKUAmsy0D/C2CM25\n",
+ "q38c827dgYTnZjZnTFxlPuxm+JuWvYpOeWyy3J/wjV/USVL+4BKz61/Ccy+EH/JkQUqRmUOtvYei\n",
+ "XxTdexyug9nI6kyTGc2H3hy0C3uFxKKFKo9PfiwDCQWhQ1+vZIsII4FYexn+pQbkz5kmdlWKB5Lx\n",
+ "ONpNVggWvIuTYEFI34NTLTOf285YYkebB68ywIJ5f1uX/OXMZ5RxH3gjNZ8mKLNX9suvs06qOt/Q\n",
+ "e2ZfZ7Orgt/l3O7GLxwWvzugIsO88I1KhpZhgYDdYZ//1lVBcwG/tKVYjF1obqjtyFctY9LPGIag\n",
+ "318ehZmIvkhW9djj90e+pnWknudbQDv3Os17s3l7qFADdqSGqYyGaSU47a6O12HCRSwmepV1bewA\n",
+ "AAIrQZt4SahBbJlMCCH//qpVAC8LE+AX+ndLRI9AAL65x3/f4eNbK2tvWi3seP5qm31GHdf4edmk\n",
+ "0/ZKv9BuxjUGH/qoYxXDUlaWZFHb65x0lomfbckqRBtklU+1LGTmYtvnPAbKnUSAh/jTBATZpFND\n",
+ "l6V6ofQ5PTBcFjOWwgI6YqalXUkmqnN6g77O4xvodhM7XQWhsA44ADmvatn61wvReF9d9MqoCN9N\n",
+ "Twpkx2kbbrSoHJrSyqidCsv+e2gnLoWDEdLGn/42++dseweQBj40iKRQ7paDrpDRwTZVjGQJ+52c\n",
+ "gaUSUp5A/cAn4FgESmp/sZ0NpfD9/7ZAmCbSUfPUar6ndxZ3XG2DXWcNFu473rzFQZNpJnXg/Pfh\n",
+ "QCQDuu/iX2Vi2NjGs1QVI3BReUxvD8Z/YeLy6w0jDh9dcJGJdKoNjb9Epdy5r0lFeFb9L8AWhdEd\n",
+ "sGreMPdTiMRlq+JOqjdogseyQTcuDo5iesxIsb0dhY+P9VqSJtTxyPO42dn6TXPZDgt1vROlp+Ic\n",
+ "VTutbib7FY5U+jSckVQsLzLRwDuIoa+HpEcHjzuwHMaHrKVljgiPeRI3Afdpqx3nHgy0MFCOhGEr\n",
+ "Jkw+Dadh5qrWjCGOX2K5HPLV0E5qw7krTDhpWX8sTsYsIqvxr/V2EjIFiKwnheBvunmhlbHNUKTl\n",
+ "ykWRC9Afa8QE+vO8sLJHYNqVh5kOrsn0+NP1Mm4JPbYiahSDJa4o8TJzkXFBAAABAkGflkUVLDv/\n",
+ "AGBJAvfAgTZO/kHo4lc9yaSVZkgaxkXEQAgySaAqoJy8U1XmJXFaLzsHv4KqZnckX0gP1AYFUr5X\n",
+ "3Zof5zltHp7OQG87KhkyMuJLOz4diYjf3ctsH2KA3/S29L1hP4qjZ9kfgNEsjrH/nSlX3ikiiFcQ\n",
+ "/2mu5vwlzQMTIUj5/0pAslvbULpI2rwxcgfjtpeW3qe/Q0sCZXyJ3L7VhEaeyKZo/ALUAi114xdn\n",
+ "Gao6fyKpZhWohGCsI53i8XO3Y7Dq+aD4ONx4A265BL770fTZiNNw+oM7dwTK1vcPMdOTVjz4fi6j\n",
+ "bCMBPzMCGM7CsAz7OQTIKiUTlOi8YAAAAakBn7V0Q38AeTG7snd+wR+ioRwfka+slSBm7w4HiigA\n",
+ "mYoe7RzT8waKJhe/5/xyHdk2lI4Qb6yur2vWdYx/k/gVzZWx+dAAALHLM2W5kE06MD+/WY8W9vMg\n",
+ "jgsWx+NCob+sUo3r0m3kC7Z6vE5pa/kp8NVK1XizBU/gSaY6/S/NP+nzZeAUHhvnb6LPnQnTmhI7\n",
+ "+CLAa1UiK6P+lwPbKP0S0Q5RWiopmhls/AKTmwxXB+WRWyrrFglLMCCi/H7yBlZCPn3f1nUi1WXW\n",
+ "txmtCNftDVTPLfu3fbw+YSszpG0LQoe/d+Hn14JtNEXcVveVKgdRtrJ2SZSzkDZoD5uTokEopKbG\n",
+ "geSmsxJSe6mDenK/tstnSjFiozTKWgyJb1mTK9iBWStV+uPeceDypkgatRgkwgz17Zgn457UL8xo\n",
+ "RIb3Rzvhn1PaM6KKHv4wQMqvpqRXKRm+SScKgBhgUzc706tHx+sk3QXrFbfmTj3VwEqpASdMV8SQ\n",
+ "Rc7Pl7VdiwexHM38nPcgZguGyvH4NF1CZay1mT9d+wee9MfU3VHZJgMp057sUGFJIJZNmQAAASYB\n",
+ "n7dqQ38Ah1fDGltbSoFNBABy4LNfpqaOuQiA03rsvInHR01iNZMDGQE2sq9jRvjWYcCsjv8TgHDx\n",
+ "TelM9UgK8aIkbW5xZBO7YH31DMzHB/HcoCKmBUni45/7i/CIo8gF1pGPr0DAA7wV6D09MIgWLTIz\n",
+ "u2RlgzWHXLOhQSqpesq6gEgghz4eO+szzJWiaji2cgnbFYV7gS1iXMpBIisJc8i3U9gywhFgtGxt\n",
+ "IPW/7TiYEwGOLwxyjZX1HkROuSI8lAAdZBpungwbYVpPKSngzu3PnOIcBqes7c29MHD8jRPn7Zrt\n",
+ "720E/jZ4jB2yT62h5AEs+TCYeJmiY6lwGwXm58hIVqeMFafCwAYhd3vDCtfE6mymrvYwtLYQ0YeE\n",
+ "Ebj2MbA5+zEAAAFwQZu6SahBbJlMFEwR//61KoAWx89GABUe1i4OfaowcQHQyqHCv9PnwkHOB5jh\n",
+ "ZaY1nqaJvfgMHLxnx0HRU319XsFiIgZ3fycxZ7MoTbod+V6rFy2y2Qtld8RvCt0Ug4PVQuLFLU9x\n",
+ "N6gbeWntqj92UVkXYHO8rtnoyHbc5vkyDRwK85+1rEknOmV2fCPAJQWJQHZKzqn/akJ6R91HlWya\n",
+ "u/8GgP8q7KTtX0XyZMALsB3jT/UhmW5AlGIwNHeW1rtDiMG/Xy+69i+m2kTOjww4y5o0/8WfwLLR\n",
+ "RKlhEE1LYjJQjoy3+hNy7YguxzdtR0GOg0UsPQLFZIBnnCwGmFharg9MSkzKoZck80tBnNzVcu5F\n",
+ "Ot8W+bdDLv2E/9UTXci1RXlM26z5jearPa/9d/CciU6kElsImbzJ5J2YpzVs+pvW89XbvAJMExZq\n",
+ "wXD26iUkefzti1p2cc2CbM5qN5CGCTCmR13du1Y9J/JQwXkxhEAAAAFiAZ/ZakN/AHwUpp6Dymc0\n",
+ "2L536BR5shJlFypABdlGcrzfdaw/6f5GB/atQKmEnLjISTsAvG6zfbdBMs7bm2yeFrIQxXuK81kC\n",
+ "9pAAAXcBlvswH72knWeKBsU0Ht1g5h3YcKtQv4e82ah693wXobc+mdHgPA3TBKIFWUv/iM+/E90G\n",
+ "S/NmTeZC+lgt/zT/+HMt/QSFK9C1+AMdH9l6Wmy5eJzA8pumBNuqAArwclv8LW1AC9Ryj7J7dIqZ\n",
+ "2nhKIYQ08cavMFAGExrDHt7RiTs4Auer+jpijDT1MWhCFcQjNZn9nbOp1MdYUZ3batlHR94YKH39\n",
+ "SB9iaEe1H+vDrSDRsP3b0PfVLevCUtQQ7tTMju5YxLigI0SkXHby6oMGwH35DOmYdZ/QEHihEbbH\n",
+ "ljlaWypqm6TR7b/zNBCPoaZiHS0IlbTr/gzMbXxGasP7GssB89XtUV2jZihKJYcij8456L2VAAAC\n",
+ "WkGb3knhClJlMCCH//6qVQAvW48vGhnpxPcAFRvWsRQfCH0ZQNKlkI/Fmy/VFBZqjdqwlFWyRDRU\n",
+ "ATa/x8nSCThm/LYIboN0iejGj3Uchm8nyLv3P3+HOOnCw7+XGsyycSpaT/SKI8hu4RwjrdDxqaYn\n",
+ "k6pZ6qjZtX+IZ04XS8X44piBkZKHHklQnddyez3eJG0JjT0fN5b/c72jAD+sOeXlR6iPKkSUzu0o\n",
+ "3ha2oHN6UEDmISbP1cbB3piI/SHrisHlFNjIuHiEdkqSzG95tlcEE5RmJMFHyIZtmV+VUnHUg//H\n",
+ "WOVjyT0+oFlaS4c8th8dtoQJgchjo9u+OPpSDxEJgWI6zeeh28ogNTGzlwRqjfRSsrTItvjA1MD/\n",
+ "oBFhKLk5Gm5LLSkMpDHu9T5I2IaoH3PKDFRJp5FswrHAqK+C6EMiKJRw3UfQ++e71IzTL0xpDNJL\n",
+ "z6AeitOHT7WHH1q0lcaxtRKIXyzlri2FOeAU+zEh7DbcM3wvbzCPYrbD4ePmP1flYALif0DM+F20\n",
+ "woqO1ciEp6KvfcdLwkVhOi6HukmunTXGsruYaqjkaLT2QlUIMJVPTAaXGvEAsJSG/0vfsDXKkk6Z\n",
+ "sB3ElNrSO3yHej1aIEgW5xnCNisEQsWn6TKnOYGilPN4ZN8EB64V0F8PWNB9Aq0baX+T8kKesmFw\n",
+ "2y/668NRP8ypn4s+0TEew3V5nLH+An+XxWolypflMoVnWhEhG2W+IIgxfWfPuSgDmqBKtSemnfnO\n",
+ "mj2z1HJ4yEmqNoBjJwYnWfK8e0PHHb381Mk1zGGJOgWAAAABUEGf/EU0TDv/AFlVerlP4Rak+BQA\n",
+ "rfH1MAekqKZtO9rI3YpPu0XbIusXd4D2mikBBjNWCs5ZCx1/nIkAW78LpHSyCScRX686DgqeELvg\n",
+ "+6gjEvz9oPv/Q5SyPMBeMNrb/QJ3ato+Qw19nLJWjl0bduh+HilMsrklIYKHCWBaC/dNC4s7Xl/r\n",
+ "RCzM7ZJuRKmUY/D5sEAdr/H6TIVmiD0u2jiehC8y8Gw6flB5fdlWyz5ArpMes88RS9cHH1n4Dp5A\n",
+ "9YiKoxa6XsjMVtwy/Q1CE1CcjEE8nX1x2wi3FF+AiuFwqQsSRlHtfUsVksDBdXLvE8zjbyOIuIMV\n",
+ "pnJU22cEHHqRAVAAAQz/a8I3JUwtCYefKDlHQuITIdlhxtkj1S9/MOKY0At1R1tnioLMWN7HUVCo\n",
+ "b6XS9uoGwS6oOJgKcTFbR1vNa4wchWq0XCPds0DBwQAAAPYBnht0Q38AeTSjvudgsbkOLNHOwJSE\n",
+ "7MIAOT4Tae/DlzyAOhFcKHSt+XmND2K3krM1WAe1ksxoXOx8R5ib25iI4yoXHAvjcPvcDoLvQIYy\n",
+ "rfzkEj8FCsgVqTty2M7mcrrsvBMmGI/tSEAq1Wpq/wSUg2I4oZj0GjiChzewD+uw3YnWAi/Ntf5Y\n",
+ "Cv2dU9qEo9e3jPCavhxnj6HVQyqcvxekJ6cEcAGQvRh8PwiQyys4LYMz+Th6jmnZO6zDQlY1h459\n",
+ "aXiX/1NPDVjhvbOibPxdXy1nW8ZFN/ZpmMtUtTAz4mvuGfLCJYTZv8r0n1cztBPRieehovEAAAGy\n",
+ "AZ4dakN/AHwTrqiSAEDVZr7cfUIfCi6SEtf6z4BBmn/qEvCbGFYoG0hJzipIIEfgPxGLOPb5hgYo\n",
+ "3EqlxYfhyi3ADlPB0rSvUe/2K1c1bOHHkBdbN7v2fRCe6cTgBUViIyBzKbW8+YVzs1NjLsftvDLF\n",
+ "Jws+AVbFUOsz2XZO6+tJqS4okplORVfI8Zh8pjE7ly6+HI7Omo301kEp6VZks8VHiVKJOuTRsuFe\n",
+ "1lak9cDIgZS7IV3MkEjdmu8V6wPVTOui5KhgRegdKpe7dvKwiZROacSHUyEpgoiQ49NAkgd9ICSC\n",
+ "nOG96XtcVUK5qLGXI1ECEXtJcuaFVMtCmmOBBiFL8jC1MpHbxQ+4k2qRSUjP3JvFi0NfrsxeXbrH\n",
+ "Ebg5vBmNpJE6T+wdC73c70xC+Mtp+wYFzu5kfTKcL8d+Nzu4GlIr338e6SWwNSpXRGjfdLp9o3Ic\n",
+ "2PzMtQmrlpbEeUDp1vnkaZoqSF5M9xanIk/zohgoPX5++NN/ebYvr56WROjUeIUdsOf6nrJlmboT\n",
+ "DZEat6r4aY15lVCgiz4Mpb/mqSazxzrszmdRYRxGsW8DnzAAAAHfQZoCSahBaJlMCHf//qmWALFy\n",
+ "5oM61QiAB+cxK4+jNCOHXw6RALujtnWF0llKsvjvaSIz+44BdTBn8Dqmduydu0Ab2yYLL8rBa9BR\n",
+ "bM/WBrO6FCt4pfpaT57HiAbORTevnWHgnUCdwsiqbddvhjkiuJYbgCMD0kEP1SURu/b2Z5hWsq5s\n",
+ "eIdJwlVUmffx/GFsHH2OVg2kldaudIzyWEsMXsnZccvZ4+1TTMECSDKdUtlhUW9AAgPUraaePKP1\n",
+ "hatMAsKbsEP5g1nzjTlmyHjs7FjRbwjKng4/qsqVQ+s9Z8Le9mq44VPerxrlkKxdRgf8PQXTEpxP\n",
+ "gMR8UP9I/vRSJBbzTafYsMhPytfC8ESUe9ySga0pNZKSvC+bN1h7zO9OEjqF3rsnXJU2SZN7NAbS\n",
+ "01WCPkWQIdWN39TZ8BwhuM2E1/XfXA9OxCI/7PAG40Z8M1rKVJPTY+iwZnIQA6cEF3rnJVasn/JZ\n",
+ "rircnzzi1JQr5NiwthCEkD02k7GAoyHtF8lIKArvw+GqH7Ox1Tpd6DhPPJm2hmyijeFH6E+9UCJk\n",
+ "Iiolc9K3UW1rmUlHlF/p9jHAvsiiJUpuG/KCfna2LEYj9yn6P2oNlWfqq5P2HNtctaJeVRZv9Qb/\n",
+ "mNVjyjAAAAErQZ4gRREsO/8AZUEtk8LzOoS4AAhIFC88oI10PfUAs3UxxCOOtSzHREgn4/jgVfHt\n",
+ "0r483Tf2Y8D+zGlycQw2lUV6Nidlo0k0sASUCm4dEwF8Hb0+IzseFE0dYexJdLqvhcI7IIUIH6RG\n",
+ "uv8cjTXFD8CTksvYGpGc+uBYXhlwc3/jHhNGtm8G24uHniey+Zy/NtEpSl5dub3bE324kx+/N1gF\n",
+ "sU/CxkQF6UQWvd6Br4nL+i2L6udCLqM/JAVJhScc01UR/bE+NX2i3upx0qofgxfWL8unNZ/BP9Vc\n",
+ "CvVXAtxPw+0JopAnWMlwtBFG9wd+oP4zOIJ88u/VEvyZQd0JJP1Y3qhYk13Deyiv0C1r6ci1z7CQ\n",
+ "UwYqgUT64pT/hlIvHeCzEZxqH+WbUbEAAAGYAZ5fdEN/AIteE+hbrZmAAHNd3/IVGxTYP4E6C+Wr\n",
+ "63le3xAHjzqOqEil1tIAAUY3LvF62/277H30QskV8sEjceHvPe7bE0mfZ44avBY2gS0AAAMByRDk\n",
+ "EKOyh31Y2H0mdsy+zcGsPrGm3pHtO2riBcgILxHO0F5398HG90hK8UgtDUfp9CQyPOvDSyEU4WTb\n",
+ "6/WT9Z3aca6tb4C53W6p8Geyjq/mwbvNpnCVbbqIcx1ZT2+dencovmeYmPlI7jrhk6KwLYEd+5gO\n",
+ "J2YeKk4iWai6BsaO9+Tb5P52jBVHcSZ+Vws5QhTxkBSpdHlWJRcbh50V4ViVltwUN//XNx+jx2bk\n",
+ "KsfglI41FGmS2xAJtr8ZhKDk1VRRL2tGsNB5nztuRXCFd8q4MIuVVWGjim0ntcxZ/R18mzJZN+sI\n",
+ "qKUvfsxoaeZp+oIaU1hLeXzgcHEe+3/6emdZeJWoDNhUqhkfWzWzVZbEzUKpDBS9AbVIA5KR27LD\n",
+ "3HEfRMw9yt8eYILg7m/Rm2ubtU8u6V2QuxVXq1OHry5oY2TAAAABvQGeQWpDfwCPV5unds/RGF4o\n",
+ "aWlq+XwTSVpG+igacFOApaqyNJIXSXT4q7gA4DkP0YAYAumNCN0MwD7HSEeIsv3Q3L9kZ2RagxvU\n",
+ "jle4yQq6Zl5W7AgdlZnaBngH/w8xYsqWx5t90zzi7s9VyRY9jaNshfxuJAZcRgFILNTmQNCPoCtl\n",
+ "wyo5Ht91VCy2qSby6JDLeTD096PzM4KOK7/I+amuefuT0S/QnDNs952oi11JV2mbadqtKDqJE9x4\n",
+ "nX/OjU9PBP1uhsFLNkjsz6ZHlTOcsZvWUxabbw0HBNFuLXWIYqtAYdWN7c/QUoqY2IlVBR//v+NN\n",
+ "Bxf/rxPv+9QlTTeUOAVhzyU/kQACorW+VEL2KFNUPF85LUxlbSGEYQv/98/fAQAu6hKRw3yoJoPy\n",
+ "tyr7S7Za9gGurMYseuvuasNoB+fPCmp37VWgm4yNZQ0LM+8CPtaQgShVMs2/RIG2cXksHuYVqEB7\n",
+ "PJtzP2tl8EYDen8RohIb2UO5d/Xdc8aoi/Nu4IzGq8ApuZIxjC5J9bUYtMDEDA6eChGKPjb20vqg\n",
+ "2PRBI2fSXJrcSROGTC4m+VsF+VagO1LnjrakndEAAAHtQZpDSahBbJlMCG///qeEAVH55ayIAL6z\n",
+ "9D9Go2JR/VsPgULYIy+HM1JNQWUio64eqKV59gHDbxQ77xKGvVi/RlMeepNHF+Cplpp4rKqgivaK\n",
+ "14o0jVVjKwdzXmYfm8QJck76NrSj9rXzMi3Th9DbQ5HQHvlFr1+Ft6fGVXaubVoF+Bx3J4nvsWO+\n",
+ "FhXDphKaWh9geM/3PqX1TK4zqhRL2wKgDCWdLvIi2s2e48RSWR1zksj0SjkMINJfgjA7wVj0dW8Z\n",
+ "NZGlcRPjgkoSgpomI+x9/l7dJ5fHEj4WOkMQMTJnj+KOqaXfgtXbhBachZ0Av1Z6rh+qw/iObJOy\n",
+ "7q2gUdlftEWI7In7KZjqqg18Bg+z35wI2FmknOyXdEiDAPaFiRrhqkKOLfgLssw1BdohiuTGWlKn\n",
+ "NvPL4EzIbAUeS+0qv5cFdXvRjnn1zOMYTMpyN1CZYg4pqjj8mGtGdm1F7w0Xo4Mnm3hRmvZyyOaW\n",
+ "yf38s1SCwyOkhQcwJhrAAebvkxMWrAUWrTq9K9PdCUqFbMVB9+93aovoux8zBfM/WLangtLLXd/D\n",
+ "T9TcgY0eosWGZeAhQk2sxNC3bgvMT328AT2T2XCg2nG4jsOakPWfscwbc0zKfItj/1eXvyR2tk+K\n",
+ "fpgdg9dJ/OdcXINTUAAAB95tb292AAAAbG12aGQAAAAAAAAAAAAAAAAAAAPoAAAnEAABAAABAAAA\n",
+ "AAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAA\n",
+ "AAAAAAAAAAAAAAAAAAACAAAHCHRyYWsAAABcdGtoZAAAAAMAAAAAAAAAAAAAAAEAAAAAAAAnEAAA\n",
+ "AAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAEAAAAABsAAAASAA\n",
+ "AAAAACRlZHRzAAAAHGVsc3QAAAAAAAAAAQAAJxAAAAgAAAEAAAAABoBtZGlhAAAAIG1kaGQAAAAA\n",
+ "AAAAAAAAAAAAACgAAAGQAFXEAAAAAAAtaGRscgAAAAAAAAAAdmlkZQAAAAAAAAAAAAAAAFZpZGVv\n",
+ "SGFuZGxlcgAAAAYrbWluZgAAABR2bWhkAAAAAQAAAAAAAAAAAAAAJGRpbmYAAAAcZHJlZgAAAAAA\n",
+ "AAABAAAADHVybCAAAAABAAAF63N0YmwAAACzc3RzZAAAAAAAAAABAAAAo2F2YzEAAAAAAAAAAQAA\n",
+ "AAAAAAAAAAAAAAAAAAABsAEgAEgAAABIAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n",
+ "AAAAAAAAAAAY//8AAAAxYXZjQwFkABX/4QAYZ2QAFazZQbCWhAAAAwAEAAADAFA8WLZYAQAGaOvj\n",
+ "yyLAAAAAHHV1aWRraEDyXyRPxbo5pRvPAyPzAAAAAAAAABhzdHRzAAAAAAAAAAEAAABkAAAEAAAA\n",
+ "ABRzdHNzAAAAAAAAAAEAAAABAAADMGN0dHMAAAAAAAAAZAAAAAEAAAgAAAAAAQAAFAAAAAABAAAI\n",
+ "AAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQA\n",
+ "AAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAA\n",
+ "AAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAA\n",
+ "AAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAMAAAAAAEAAAQAAAAA\n",
+ "AQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAAB\n",
+ "AAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEA\n",
+ "AAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAA\n",
+ "CAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAM\n",
+ "AAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgA\n",
+ "AAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAA\n",
+ "AAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAA\n",
+ "AAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAA\n",
+ "AQAABAAAAAABAAAMAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAAB\n",
+ "AAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAAAgAAAAAHHN0c2MAAAAAAAAAAQAAAAEA\n",
+ "AABkAAAAAQAAAaRzdHN6AAAAAAAAAAAAAABkAAAGhgAAAl8AAAFjAAAAvgAAAXYAAAHzAAABDgAA\n",
+ "ATYAAAFIAAAB9QAAAOIAAAD6AAABWgAAAbAAAADTAAAB8wAAAN4AAAH+AAABEAAAAOIAAAG2AAAC\n",
+ "DAAAAWUAAAGkAAABmgAAAckAAAEdAAABfQAAAPMAAAFxAAABIgAAAjYAAAEmAAAA5AAAAXoAAAH+\n",
+ "AAAA/wAAAT0AAAFnAAACAwAAARQAAAE3AAABTwAAAckAAADrAAACFwAAAP0AAAHzAAABIQAAAOAA\n",
+ "AAHKAAACOwAAAVQAAAHFAAABugAAAdQAAAD3AAABUgAAARIAAAFuAAABLwAAAhAAAAERAAAA9gAA\n",
+ "AZkAAAIqAAABIgAAAV0AAAGIAAACSgAAASgAAAFEAAABggAAAegAAAD+AAACCgAAASIAAAIdAAAB\n",
+ "KAAAAQcAAAHbAAACFgAAAT0AAAITAAAB2gAAAi8AAAEGAAABrQAAASoAAAF0AAABZgAAAl4AAAFU\n",
+ "AAAA+gAAAbYAAAHjAAABLwAAAZwAAAHBAAAB8QAAABRzdGNvAAAAAAAAAAEAAAAsAAAAYnVkdGEA\n",
+ "AABabWV0YQAAAAAAAAAhaGRscgAAAAAAAAAAbWRpcmFwcGwAAAAAAAAAAAAAAAAtaWxzdAAAACWp\n",
+ "dG9vAAAAHWRhdGEAAAABAAAAAExhdmY1Ny44My4xMDA=\n",
+ "\"\u003e\n",
+ " Your browser does not support the video tag.\n",
+ "\u003c/video\u003e"
+ ],
+ "text/plain": [
+ "\u003cIPython.core.display.HTML at 0x7f84b2253b50\u003e"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import time\n",
+ "import traceback\n",
+ "\n",
+ "from matplotlib import pyplot as plt\n",
+ "from matplotlib import animation as anim\n",
+ "import tensorflow as tf\n",
+ "from tensorflow.contrib import autograph as ag\n",
+ "from IPython import display\n",
+ "\n",
+ "\n",
+ "@ag.do_not_convert(ag.RunMode.PY_FUNC)\n",
+ "def render(boards):\n",
+ " fig = plt.figure()\n",
+ "\n",
+ " ims = []\n",
+ " for b in boards:\n",
+ " im = plt.imshow(b, interpolation='none')\n",
+ " im.axes.get_xaxis().set_visible(False)\n",
+ " im.axes.get_yaxis().set_visible(False)\n",
+ " ims.append([im])\n",
+ "\n",
+ " try:\n",
+ " ani = anim.ArtistAnimation(\n",
+ " fig, ims, interval=100, blit=True, repeat_delay=5000)\n",
+ " plt.close()\n",
+ "\n",
+ " display.display(display.HTML(ani.to_html5_video()))\n",
+ " except RuntimeError:\n",
+    "    print('Could not render animation:')\n",
+ " traceback.print_exc()\n",
+ "\n",
+ "\n",
+ "def gol_episode(board):\n",
+ " directions = tf.constant(\n",
+ " ((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)))\n",
+ "\n",
+ " new_board = []\n",
+ " ag.set_element_type(new_board, tf.int32)\n",
+ "\n",
+ " for i in range(len(board)):\n",
+ " for j in range(len(board[i])):\n",
+ " num_neighbors = 0\n",
+ " for d in directions:\n",
+ " ni = i + d[0]\n",
+ " nj = j + d[1]\n",
+ " if ni \u003e= 0 and nj \u003e= 0 and ni \u003c len(board) and nj \u003c len(board[i]):\n",
+ " num_neighbors += board[ni][nj]\n",
+ " \n",
+ " new_cell = 0\n",
+ " if num_neighbors == 2:\n",
+ " new_cell = board[i][j]\n",
+ " elif num_neighbors == 3:\n",
+ " new_cell = 1\n",
+ " \n",
+ " new_board.append(new_cell)\n",
+ " final_board = ag.stack(new_board)\n",
+ " final_board = tf.reshape(final_board, board.shape)\n",
+ " return final_board\n",
+ " \n",
+ "\n",
+ "def gol(initial_board):\n",
+ " board = initial_board\n",
+ " boards = []\n",
+ " ag.set_element_type(boards, tf.int32)\n",
+ " # We are being explicit about tensor constants to ensure the loop\n",
+ " # is not unrolled in the graph. This may change in the future.\n",
+ " for i in range(tf.constant(NUM_STEPS)):\n",
+ " board = gol_episode(board)\n",
+ " boards.append(board)\n",
+ " boards = ag.stack(boards)\n",
+ " render(boards)\n",
+ " return tf.no_op()\n",
+ " \n",
+ "\n",
+ "with tf.Graph().as_default():\n",
+ " # Gosper glider gun\n",
+ " # Adapted from http://www.cplusplus.com/forum/lounge/75168/\n",
+ " _ = 0\n",
+ " initial_board = tf.constant((\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n",
+ " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,1,_,1,1,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+ " ))\n",
+ " initial_board = tf.pad(initial_board, ((0, 20), (0, 10)))\n",
+ " \n",
+ " tf_gol = ag.to_graph(gol)\n",
+ " game_ops = tf_gol(initial_board)\n",
+ " with tf.Session() as sess:\n",
+ " sess.run(game_ops)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "7NgrSPCZxs3h"
+ },
+ "source": [
+ "#### Generated code"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ },
+ "height": 2323
+ },
+ "colab_type": "code",
+ "executionInfo": {
+ "elapsed": 753,
+ "status": "ok",
+ "timestamp": 1532101593840,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": 240
+ },
+ "id": "hIGYeX0Cxs3i",
+ "outputId": "e0b62eb1-3e12-4e53-dc54-8a3fa56d823d"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "from __future__ import print_function\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "def tf__gol_episode(board):\n",
+ " try:\n",
+ " with tf.name_scope('gol_episode'):\n",
+ " directions = tf.constant(((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1),\n",
+ " (1, -1), (1, 0), (1, 1)))\n",
+ " new_board = ag__.new_list([])\n",
+ "\n",
+ " def extra_test_2(new_board_2):\n",
+ " with tf.name_scope('extra_test_2'):\n",
+ " return True\n",
+ "\n",
+ " def loop_body_2(i, new_board_2):\n",
+ " with tf.name_scope('loop_body_2'):\n",
+ "\n",
+ " def extra_test_1(new_board_1):\n",
+ " with tf.name_scope('extra_test_1'):\n",
+ " return True\n",
+ "\n",
+ " def loop_body_1(j, new_board_1):\n",
+ " with tf.name_scope('loop_body_1'):\n",
+ " num_neighbors = 0\n",
+ "\n",
+ " def extra_test(num_neighbors_2):\n",
+ " with tf.name_scope('extra_test'):\n",
+ " return True\n",
+ "\n",
+ " def loop_body(d, num_neighbors_2):\n",
+ " with tf.name_scope('loop_body'):\n",
+ " ni = i + ag__.get_item(d, (0), opts=ag__.GetItemOpts(\n",
+ " element_dtype=None))\n",
+ " nj = j + ag__.get_item(d, (1), opts=ag__.GetItemOpts(\n",
+ " element_dtype=None))\n",
+ "\n",
+ " def if_true():\n",
+ " with tf.name_scope('if_true'):\n",
+ " num_neighbors_1, = num_neighbors_2,\n",
+ " num_neighbors_1 += ag__.get_item(ag__.get_item(board,\n",
+ " (ni), opts=ag__.GetItemOpts(element_dtype=None)),\n",
+ " (nj), opts=ag__.GetItemOpts(element_dtype=None))\n",
+ " return num_neighbors_1,\n",
+ "\n",
+ " def if_false():\n",
+ " with tf.name_scope('if_false'):\n",
+ " return num_neighbors_2,\n",
+ " num_neighbors_2 = ag__.utils.run_cond(tf.logical_and(tf.\n",
+ " greater_equal(ni, 0), tf.logical_and(tf.greater_equal\n",
+ " (nj, 0), tf.logical_and(tf.less(ni, ag__.utils.\n",
+ " dynamic_builtin(len, board)), tf.less(nj, ag__.utils.\n",
+ " dynamic_builtin(len, ag__.get_item(board, (i), opts=\n",
+ " ag__.GetItemOpts(element_dtype=None))))))), if_true,\n",
+ " if_false)\n",
+ " return num_neighbors_2,\n",
+ " num_neighbors = ag__.for_stmt(directions, extra_test,\n",
+ " loop_body, (num_neighbors,))\n",
+ " new_cell = 0\n",
+ "\n",
+ " def if_true_2():\n",
+ " with tf.name_scope('if_true_2'):\n",
+ " new_cell_2, = new_cell,\n",
+ " new_cell_2 = ag__.get_item(ag__.get_item(board, (i), opts\n",
+ " =ag__.GetItemOpts(element_dtype=None)), (j), opts=\n",
+ " ag__.GetItemOpts(element_dtype=None))\n",
+ " return new_cell_2,\n",
+ "\n",
+ " def if_false_2():\n",
+ " with tf.name_scope('if_false_2'):\n",
+ " new_cell_3, = new_cell,\n",
+ "\n",
+ " def if_true_1():\n",
+ " with tf.name_scope('if_true_1'):\n",
+ " new_cell_1, = new_cell_3,\n",
+ " new_cell_1 = 1\n",
+ " return new_cell_1,\n",
+ "\n",
+ " def if_false_1():\n",
+ " with tf.name_scope('if_false_1'):\n",
+ " return new_cell_3,\n",
+ " new_cell_3 = ag__.utils.run_cond(tf.equal(num_neighbors, \n",
+ " 3), if_true_1, if_false_1)\n",
+ " return new_cell_3,\n",
+ " new_cell = ag__.utils.run_cond(tf.equal(num_neighbors, 2),\n",
+ " if_true_2, if_false_2)\n",
+ " new_board_1 = ag__.list_append(new_board_1, new_cell)\n",
+ " return new_board_1,\n",
+ " new_board_2 = ag__.for_stmt(ag__.utils.dynamic_builtin(range,\n",
+ " ag__.utils.dynamic_builtin(len, ag__.get_item(board, (i),\n",
+ " opts=ag__.GetItemOpts(element_dtype=None)))), extra_test_1,\n",
+ " loop_body_1, (new_board_2,))\n",
+ " return new_board_2,\n",
+ " new_board = ag__.for_stmt(ag__.utils.dynamic_builtin(range, ag__.\n",
+ " utils.dynamic_builtin(len, board)), extra_test_2, loop_body_2, (\n",
+ " new_board,))\n",
+ " final_board = ag__.list_stack(new_board, opts=ag__.ListStackOpts(\n",
+ " element_dtype=tf.int32, original_call=ag.stack))\n",
+ " final_board = tf.reshape(final_board, board.shape)\n",
+ " return final_board\n",
+ " except:\n",
+ " ag__.rewrite_graph_construction_error(ag_source_map__)\n",
+ "\n",
+ "def tf__gol(initial_board):\n",
+ " try:\n",
+ " with tf.name_scope('gol'):\n",
+ " board = initial_board\n",
+ " boards = ag__.new_list([])\n",
+ "\n",
+ " def extra_test(board_1, boards_1):\n",
+ " with tf.name_scope('extra_test'):\n",
+ " return True\n",
+ "\n",
+ " def loop_body(i, board_1, boards_1):\n",
+ " with tf.name_scope('loop_body'):\n",
+ " board_1 = tf__gol_episode(board_1)\n",
+ " boards_1 = ag__.list_append(boards_1, board_1)\n",
+ " return board_1, boards_1\n",
+ " board, boards = ag__.for_stmt(ag__.utils.dynamic_builtin(range, tf.\n",
+ " constant(NUM_STEPS)), extra_test, loop_body, (board, boards))\n",
+ " boards = ag__.list_stack(boards, opts=ag__.ListStackOpts(\n",
+ " element_dtype=tf.int32, original_call=ag.stack))\n",
+ " with ag__.utils.control_dependency_on_returns(render(boards)):\n",
+ " boards_2 = ag__.utils.alias_tensors(boards)\n",
+ " return tf.no_op()\n",
+ " except:\n",
+ " ag__.rewrite_graph_construction_error(ag_source_map__)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(ag.to_code(gol))"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [
+ "p8zZyj-tq4K3",
+ "Lkq3DBGOv3fA",
+ "r8_0ioEuAI-a",
+ "7NgrSPCZxs3h"
+ ],
+ "default_view": {},
+ "last_runtime": {
+ "build_target": "",
+ "kind": "local"
+ },
+ "name": "Simple algorithms using AutoGraph",
+ "provenance": [
+ {
+ "file_id": "19q8KdVF8Cb_fDd13i-WDOG_6n_QGNW5-",
+ "timestamp": 1528465909719
+ }
+ ],
+ "version": "0.3.2",
+ "views": {}
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
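
The notebook above funnels everything through ag.to_graph: a plain Python function with data-dependent control flow is converted into a graph-building callable, and the resulting ops run under an ordinary Session. As a minimal sketch of that pattern, stripped of the Game of Life specifics (count_to is an illustrative name, and the snippet assumes the contrib-era tf.contrib.autograph API used throughout this diff):

    import tensorflow as tf
    from tensorflow.contrib import autograph as ag

    def count_to(n):
      # A Python loop over a tensor bound; AutoGraph rewrites it into a
      # tf.while_loop instead of unrolling it at graph-construction time.
      total = 0
      for i in range(n):
        total += i
      return total

    with tf.Graph().as_default():
      tf_count_to = ag.to_graph(count_to)  # returns a graph-building callable
      result = tf_count_to(tf.constant(100))
      with tf.Session() as sess:
        print(sess.run(result))  # 0 + 1 + ... + 99 = 4950

The generated-code cell above shows what such a conversion expands to internally (ag__.for_stmt, ag__.utils.run_cond and friends).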
diff --git a/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
index 86e38c3490..7e9cc54d4c 100644
--- a/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
+++ b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
@@ -130,7 +130,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 0,
"metadata": {
"colab": {
"autoexec": {
@@ -201,7 +201,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 0,
"metadata": {
"colab": {
"autoexec": {
@@ -275,7 +275,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 0,
"metadata": {
"colab": {
"autoexec": {
@@ -362,7 +362,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 0,
"metadata": {
"colab": {
"autoexec": {
@@ -392,7 +392,7 @@
"output_type": "stream",
"text": [
"Got error message: assertion failed: [Do not pass zero!]\n",
- "\t [[Node: f/Assert/Assert = Assert[T=[DT_STRING], summarize=3, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](f/NotEqual, f/Assert/Assert/data_0)]]\n"
+ "\t [[{{node f/Assert/Assert}} = Assert[T=[DT_STRING], summarize=3, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](f/NotEqual, f/Assert/Assert/data_0)]]\n"
]
}
],
@@ -467,7 +467,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 0,
"metadata": {
"colab": {
"autoexec": {
@@ -531,7 +531,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 0,
"metadata": {
"cellView": "code",
"colab": {
@@ -896,7 +896,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 0,
"metadata": {
"colab": {
"autoexec": {
@@ -1070,8 +1070,8 @@
" return dataset\n",
"\n",
"\n",
- "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv\"\n",
- "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv\"\n",
+ "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/archive/extras/colorbot/data/train.csv\"\n",
+ "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/archive/extras/colorbot/data/test.csv\"\n",
"data_dir = \"tmp/rnn/data\""
]
},
@@ -1304,7 +1304,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 0,
"metadata": {
"colab": {
"autoexec": {
@@ -1905,10 +1905,6 @@
"colab": {
"collapsed_sections": [],
"default_view": {},
- "last_runtime": {
- "build_target": "",
- "kind": "local"
- },
"name": "Dev Summit 2018 - Autograph",
"provenance": [
{
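
One hunk above rewrites the Assert error string to the newer {{node f/Assert/Assert}} spelling; only the formatting of the runtime message changed, not the behavior. That message comes from a Python assert that AutoGraph lowers to a tf.Assert op. A minimal sketch of the pattern, reconstructed from the printed message and node names rather than copied verbatim from the notebook:

    import tensorflow as tf
    from tensorflow.contrib import autograph as ag

    def f(x):
      # AutoGraph converts this assert into a tf.Assert guard in the graph.
      assert x != 0, 'Do not pass zero!'
      return x * x

    with tf.Graph().as_default():
      tf_f = ag.to_graph(f)
      result = tf_f(tf.constant(0))
      with tf.Session() as sess:
        try:
          sess.run(result)
        except tf.errors.InvalidArgumentError as e:
          print('Got error message: %s' % e.message)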
diff --git a/tensorflow/contrib/autograph/examples/notebooks/graph_vs_ag_vs_eager_sum_speed_test.ipynb b/tensorflow/contrib/autograph/examples/notebooks/graph_vs_ag_vs_eager_sum_speed_test.ipynb
new file mode 100644
index 0000000000..32742bec7e
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/graph_vs_ag_vs_eager_sum_speed_test.ipynb
@@ -0,0 +1,519 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "moMkWaT_TTHi"
+ },
+ "source": [
+    "This Colab illustrates the differing overhead* of a custom, vectorized graph operation versus a loop over a tensor that computes the same function. The loop is implemented once in TensorFlow Eager mode, using ordinary Python syntax and control flow, and once with AutoGraph, which takes a Python function and converts it into graph mode; in the AutoGraph version, the Python loop is converted into a tf.while_loop.\n",
+    "\n",
+    "The actual computation, summing a small number of scalar values, takes very little time, so the plots below show the overhead of the differing approaches. As such, this is more of a \"micro-benchmark\" than a representation of the real-world performance of the three approaches.\n",
+    "\n",
+    "*Note the differing scales of the included plots."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "a0X_rfvuav98"
+ },
+ "source": [
+ "### Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "EdxWv4Vn0ync"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install -U -q tf-nightly"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "erq3_S7QsjkU"
+ },
+ "outputs": [],
+ "source": [
+ "from __future__ import absolute_import\n",
+ "from __future__ import division\n",
+ "from __future__ import print_function\n",
+ "\n",
+ "import numpy as np\n",
+ "import tensorflow as tf\n",
+ "import matplotlib.pyplot as plt\n",
+ "import math\n",
+ "import time\n",
+ "import random\n",
+ "from tensorflow.contrib import autograph as ag\n",
+ "from tensorflow.python.framework import function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "1JgnsXooa2RP"
+ },
+ "source": [
+ "### Testing boilerplate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "UyD5LLjVZzny"
+ },
+ "outputs": [],
+ "source": [
+    "# Test-only parameters. The test checks for successful completion, not correctness.\n",
+ "burn_ins = 1\n",
+ "trials = 1\n",
+ "batches = 2\n",
+ "max_elements = 2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "4_NBL0RQa8gY"
+ },
+ "source": [
+ "### Speed comparison parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "Yq6daecyiJV5"
+ },
+ "outputs": [],
+ "source": [
+ "#@test {\"skip\": true} \n",
+    "burn_ins = 3  # Warm-up runs per vector size, not counted in the average\n",
+    "trials = 10  # Timed repetitions per vector size (averaged)\n",
+    "batches = 1000  # Number of random vectors summed per trial\n",
+    "max_elements = 100  # Vector sizes 0 through max_elements - 1 are timed and plotted"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "fiR8m13CbKH2"
+ },
+ "source": [
+ "### Random input"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "d8vrTlyNXuxc"
+ },
+ "outputs": [],
+ "source": [
+ "# Construct a random num x 1 tensor\n",
+ "def get_elements(num):\n",
+ " return tf.random_uniform(shape=(num, 1), maxval=1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "ILJ6SbF3bXFQ"
+ },
+ "source": [
+ "## Graph mode"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "vovRf597X55n"
+ },
+ "outputs": [],
+ "source": [
+ "def tf_sum(elements):\n",
+ " # Using custom vectorized op\n",
+ " return tf.reduce_sum(elements)\n",
+ "\n",
+ "def run_trial(num):\n",
+ " elements = get_elements(num)\n",
+ " return tf_sum(elements)\n",
+    "\n",
+    "\n",
+    "graph_means = []\n",
+    "for num in range(max_elements):\n",
+    "  with tf.Graph().as_default():\n",
+    "    durations = []\n",
+    "    foo = run_trial(num)\n",
+    "\n",
+    "    with tf.Session() as sess:\n",
+    "      # Warm up before timing; burn-in batches are not counted.\n",
+    "      for _ in range(burn_ins):\n",
+    "        for _ in range(batches):\n",
+    "          sess.run(foo)\n",
+    "\n",
+    "      for _ in range(trials):\n",
+    "        start = time.time()\n",
+    "        for _ in range(batches):\n",
+    "          sess.run(foo)\n",
+    "\n",
+    "        duration = time.time() - start\n",
+    "        durations.append(duration)\n",
+    "\n",
+    "    graph_means.append(np.mean(durations))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ },
+ "height": 301
+ },
+ "colab_type": "code",
+ "executionInfo": {
+ "elapsed": 278,
+ "status": "ok",
+ "timestamp": 1532447361278,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": 240
+ },
+ "id": "Jm9Blkyx90Eq",
+ "outputId": "d83cd51f-7e56-4d73-f7df-bb157dee46df"
+ },
+ "outputs": [
+ {
+ "data": {
+      "image/png": "(base64-encoded PNG elided; the figure is the timing plot produced by the plotting cell below)\n",
+ "text/plain": [
+ "\u003cmatplotlib.figure.Figure at 0x7f47b20dd690\u003e"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.plot(graph_means)\n",
+ "plt.ylabel('Time (seconds)')\n",
+ "plt.xlabel('Length of vector')\n",
+ "_ = plt.title('Time to sum the elements of 1000 vectors (vectorized TF operation)')\n",
+ "_ = plt.ylim(ymin=0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "4KZg2WXjbhg5"
+ },
+ "source": [
+ "## AutoGraph"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "UQJBQWbCbinm"
+ },
+ "outputs": [],
+ "source": [
+ "# Sum written using for loop and converted with AutoGraph\n",
+ "def sum_all(elements):\n",
+ " sum_ = 0.0\n",
+ " length = len(elements)\n",
+ " for i in tf.range(length): \n",
+ " sum_ += elements[i][0]\n",
+ " return sum_\n",
+ "\n",
+ "def run_trial(num):\n",
+ " elements = get_elements(num)\n",
+ " return sum_all(elements)\n",
+ " \n",
+ "ag_means = []\n",
+ "ag_run_trial = ag.to_graph(run_trial)\n",
+ "\n",
+ "for num in range(max_elements):\n",
+ " with tf.Graph().as_default():\n",
+ " durations = []\n",
+ " foo = ag_run_trial(num)\n",
+ " with tf.Session() as sess:\n",
+ " for _ in range(burn_ins):\n",
+ " for _ in range(batches):\n",
+ " sess.run(foo)\n",
+ " \n",
+ " for _ in range(trials):\n",
+ " start = time.time()\n",
+ " for _ in range(batches):\n",
+ " sess.run(foo)\n",
+ " \n",
+ " duration = time.time() - start\n",
+ " durations.append(duration)\n",
+ " ag_means.append(np.mean(durations))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ },
+ "height": 301
+ },
+ "colab_type": "code",
+ "executionInfo": {
+ "elapsed": 310,
+ "status": "ok",
+ "timestamp": 1532448438694,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": 240
+ },
+ "id": "DLDOmrRW99v5",
+ "outputId": "ae0e0573-39db-4004-a064-efc618dbf867"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAEcCAYAAADUX4MJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XdYVFf++PH3DE1AinTEjgULioggNiTYjcZCoiZqjEmM\n6RuzcVc32exPE7O72dTNmmhi1u+abLJqLFFsib1E7GLDjkgbQUSKyDAz5/eHySARcYwMQ/m8nsfn\nce4999zPPXOHzz23nKtRSimEEEKIu9DaOgAhhBC1gyQMIYQQFpGEIYQQwiKSMIQQQlhEEoYQQgiL\nSMIQQghhkTqZMObPn88bb7xh6zBqlQceeICffvrJ6uuZOXMmH330kdXXU1NcuHCBUaNG0a1bN776\n6itbh1OnnT17ljFjxtg6jGpV2e9Jr9czZMgQcnNzq2x9tTJhdO3alfDwcMLDw2nfvj1dunQxT1uz\nZg3PPPMMc+bMsXoc6enphISEYDKZrL6uqlTf/mjfjTW/xy+++IKoqCgOHDjAhAkTbpu/bt06xo0b\nR1hYGJMmTbpt/smTJxk9ejRhYWGMGTOG5OTkcvPfffddoqKi6NGjB+++++49LWttEydOZNmyZdW2\nvo8//pinnnqqwjgiIyMpLS29p/pCQkK4dOnSPS3z1VdfMWLECMLCwujduzeTJk1i7dq191RHVXF0\ndCQ+Pp7PP/+8yuqslQnj0KFDHDx4kIMHD9K4cWPmz59vnvbggw9WWxxKKTQaDfLsY+1mze8xIyOD\n1q1b33G+p6cnkydPZurUqbfNKy0t5fnnn2fkyJHs27ePkSNH8txzz2EwGAD49ttv2bx5M6tXr+b7\n779n69at/O9//7No2drgXr6P7OxsEhMTiYuLKzc9PT2dAwcOoNFo2Lx58z2tX6PR3FP5OXPmsHjx\nYmbOnMnevXvZsWMHv/vd79ixY8cdl7H2344HH3yQFStW3HOyvJNamTBupZS6rdE/+eQTXnvtNaDs\n6HH58uX069ePqKgovv32W44ePcqIESOIjIy8rTeybNkyhg4dSlRUFE899RQZGRkVrnvixIkARERE\nEB4ezpEjR1BKMW/ePB544AF69erFH//4RwoLCytc/urVq0ybNo3u3bsTFRVV7gj010c3t/YK9u7d\nS0xMDF988QU9e/akT58+/Pjjj2zbto1BgwYRFRXF/PnzK1znkiVLWL16NV988QXh4eE8++yz5nkn\nT55kxIgRdO/enenTp6PX683ztmzZwsiRI+nevTvjx4/n1KlTFdYPcO7cOaZMmUJUVBRDhgxh3bp1\ndyxbWb0PPPAACxcuZMSIEXTt2pXXX3+dK1eu8PTTTxMeHs6UKVMoKCgwlz98+DDjxo2je/fujBw5\nkr1795rnTZw4kY8++ojx48cTHh7Ok08+SV5ennkelP8eU1NTmThxIhEREURHRzN9+vQ7bsOmTZt4\n8MEHiYyMZNKkSZw/fx6Axx9/nMTERGbPnk14eDgXL168bdno6GgGDx6Mr6/vbfP27t2L0Whk0qRJ\nODg4MHHiRJRS7NmzB4CVK1cyZcoU/Pz88PPz44knnmDFihUAJCYmVrrsrdauXXvbqZxFixbx3HPP\nATdPbfztb38jNjaW3r1785e//KXcvvHjjz8ycuRIunXrxsCBA9m5cycffPABBw4cYM6cOYSHh/PW\nW28BcPDgQeLj4+nevTsPP/wwhw4dKvcdffDBB4wfP56wsDDS0tJYvnw5/fv3Jzw8nP79+7NmzZoK\nv4Ndu3bRsWNHHB0dy01fuXIlYWFhjB492tw2t67v1h7QihUrePTRRwGYMGECSilGjBhBeHi4eR9e\nsmQJAwcOJCoqiueee47Lly8DN089fvPNN3zwwQdER0fj6OiIRqMhPDycd955567bOHToUMLDwxkw\nYIA56f+yD8TExDB//nx69OhBXFwcq1evLrcd165d45lnniE8PJyxY8eW+7vh7++Ph4cHR44cqbDd\n7pmq5WJjY9Xu3bvLTfvnP/+pXnvtNaWUUmlpaapdu3bqzTffVCUlJWrXrl0qNDRUPf/88yo3N1dl\nZWWp6OhotW/fPqWUUj/88IMaOHCgOn/+vDIajerTTz9VY8eOrXDdaWlpKiQkRJlMJvO0pUuXqoED\nB6q0tDR1/fp19cILL5hj+bX33ntPvfnmm8poNCqDwaD2799vnhcSEqJSU1PNn//4xz+qDz/8UCml\nVGJiourQoYOaN2+eMhgMasmSJapHjx7q1VdfVdevX1dnzpxRoaGh6tKlSxWu99a6bm3Hhx9+WGVn\nZ6tr166pIUOGqG+//VYppdSxY8dUdHS0SkpKUiaTSa1YsULFxsYqvV5/W93Xr19XMTExasWKFcpk\nMqkTJ06oqKgodfbs2dvWfbd6Y2Nj1dixY9WVK1eUTqdT0dHRatSoUerkyZNKr9erSZMmqU8++UQp\npVRWVpaKjIxU27dvV0optXv3bhUZGalyc3OVUkpNmDBBDRgwQF28eFGVlJSoCRMmqPfee++O3+P0\n6dPVZ599ppRSqqSkRB04cKDCtjx//rwKCwtTu3fvVgaDQX3++edqwIABqrS01LzepUuXVrjsrZYs\nWaImTpxYbtq///1v9fTTT5eb9swzz6h///vfSimlunXrpo4cOWKed/ToURUeHm7RsrcqLi5W4eHh\n6uLFi+ZpY8aMUWvXrlVKKfXWW2+pZ599VuXn56uioiI1bdo09f777yullDpy5Ijq1q2b+Teo0+nU\n+fPnK9z2vLw81b17d/X9998ro9Go1qxZo7p3767y8vLM5WNjY9XZs2eV0WhUBQUFKjw8XKWkpCil\nlMrOzjbvR7/2t7/9Tc2ePfu26QMGDFDffPONOnbsmOrYsaO6cuWKed6v41u+fLl69NFHzZ/btWtX\n7je4e/duFRUVZd7/5syZox577DGllFLffPONeuCBByqM7Va/3sbS0lK1detW82913759qkuXLurE\niRNKqbLf+l//+lel1+vV3r17VVhYmLpw4YJS6ubvKTIyUh09elQZjUb16quvqunTp5db57Rp09Ti\nxYvvGpslan0PwxIajYbnn38eR0dHevbsibOzM8OGDaNRo0b4+/sTERHBiRMnAPjf//7H1KlTadmy\nJVqtlqlTp5KcnExmZuYd61e39HDWrFnD5MmTCQoKwtnZmenTp7N27doKz4/b29uTnZ1NWloadnZ2\ndOvWrcI6K+Lg4MC0adOws7Nj6NChXL16lccffxxnZ2dat25N69atK+0FVGTSpEn4+Pjg7u5ObGws\nJ0+eBGDp0qWMGzeO0NBQNBoNI0eOxNHRscKjli1bttCkSRNGjhyJRqOhffv2DBw4kPXr199W1pJ6\nJ0yYgJeXF35+fkRERNClSxdCQkJwcHBgwIAB5hi///57+vXrR58+fYCbR+6dOnVi27Zt5rpGjx5N\ns2bNcHR0ZMiQIeZlf3Frm9vb25Oen
o5Op8PR0ZHw8PAK22zdunX069eP6Oho7OzsePLJJ7lx40a5\nI+ff6vr167i5uZWb1rBhQ3OP9dfz3dzcuH79ukXL3qpBgwbExcWZj95TUlK4cOGC+fTOsmXLmDlz\nJm5ubri4uDB16lRz2WXLlhEfH090dDQAfn5+tGzZssLt2bp1Ky1atGD48OFotVqGDRtGq1at2LJl\ni7nMqFGjCA4ORqvVYmdnh52dHadPn6akpAQfHx+Cg4MrrLugoABXV9dy0/bv309GRgZDhgyhY8eO\nNGvW7Laj83uxZs0a4uPjzfvf9OnTOXz4MBkZGVy9evW2XmJMTAzdu3enc+fO5f5+3LqN9vb2xMTE\n0KRJE+BmL7dXr17s37/fXF6j0fC73/0OBwcHunfvTkxMTLle+8CBA+nUqRNarZbhw4fftl+7urqS\nn5//m7f7VvZVUkst4O3tbf5/gwYN8PHxMX92cnIy/9AyMjJ4++23+dvf/gaUnd/W6XQEBgbedT2X\nL1+mcePG5s9BQUEYDAZycnLw8/MrV/app57in//8J1OmTEGj0fDwww9XeC67Ip6enuZzrA0aNKhw\nG3/ZJkvduryzszPZ2dnAzTZZtWqV+S4fpRQGg8HcHb9VRkYGhw8fJjIy0lzWaDQycuTICsverd5b\nY3Jycrrt863f27p168x/fH6p65c/ZEC579zZ2bnS9pkxYwYffvgh8fHx5usMFd2B8+vvW6PREBgY\niE6nu2PdlnJxcbntD3xhYSENGzascH5hYSEuLi4WLftrw4YN4+9//zvPPfcca9asoX///jg6OpKb\nm0txcXG5bTeZTObkmpWVRUxMjEXb8+u2AmjcuHG5tgoICDD/39nZmQ8++ICFCxcya9YsunXrxowZ\nM2jVqtVtdbu7u1NUVFRu2qpVq+jduzceHh7mbVy5ciWPP/64RfFWFH/Hjh3Nn11cXPD09ESn0+Hp\n6Xnb72Hbtm0YjUY6depU7mDk1m38pdy8efNISUnBZDJx48YN2rVrV27bnJyczJ8bN25cbl1326+L\niopwd3f/Tdv8a/UmYVgqICCAZ5991qKL5xVdFPPz8yt3zSM9PR17e/tyX+ovXFxc+MMf/sAf/vAH\nzp07x8SJE+ncuTM9evTA2dmZ4uJic9ns7OzbdrTqEhAQwLRp03jmmWfuWjYwMJCoqCgWLlxYpfVa\nst6RI0cye/bse162ou/R29vbfG3rwIEDPPHEE0RGRtK0adNy5fz8/Dhz5ky5aZmZmVXyXbVp04ZF\nixaVm3b69GnzNZfWrVuTnJxMaGgocPMaVJs2bSpdtqI7tQB69+7NzJkzSU5OJiEhgVmzZgHQqFEj\nnJ2dWbNmzW0HPHDzO7zTnUS/blc/Pz82btxYblpGRgZ9+/a94zK9evWiV69e6PV6PvjgA9544w2+\n/vrr29bVrl07Vq1aZf5cUlLCunXrMJlM9O7dG7h5I0B+fj6nTp2iXbt2uLi4cOPGDfMyvxwg3cmv\nf9vXr18nLy8Pf39/PD09eeuttzh+/Hi5pAK3ny24dRv1ej0vv/wy7777LnFxcWi1Wp5//vlyy+Tn\n53Pjxg3zgWFmZiZt27atNNZbnT9/nieffNLi8pWpF6ek7nZ651bjx49n/vz5nD17FrjZ1a3odAqA\nl5cXWq2W1NRU87Rhw4axaNEi0tLSKCoq4oMPPmDYsGFotbc39datW83Luri4mLvgcPOi95o1azCZ\nTGzfvp19+/ZZvA134+Pjc0+3Cz7yyCN8++23JCUlATd/KNu2bavwCL1fv35cuHCBVatWYTAYKC0t\n5ejRo+YLwb+13rsZMWIEmzdvZufOnZhMJkpKSti7d69FR/oVfY/r1683L+vu7o5Wq63wOxwyZAhb\nt25lz549GAwGFi5ciJOTE2FhYRbFbTKZ0Ov1GAyGcv8HiIyMRKvVsnjxYvR6vbknFhUVBcDIkSNZ\ntGgROp0OnU7HokWLGD16dKXL9ujRo8I47OzsGDRoEH//+9/Jz8+nV69eAOae79y5c8338+t0Onbu\n3AlAfHw8y5cvZ8+ePSil0Ol05u/61/tZTEwMFy9eJCEhAaPRyNq1azl//jyxsbEVxnTlyhU2b95M\ncXEx9vb25t9IRXr16sXx48fNF+N/+OEH7OzsWLduHatWrWLVqlWsXbuWbt26sXLlSuDmb2zjxo3c\nuHGDixcv8t1335Wr89fxP/jggyxfvpzk5GT0ej3vv/8+Xbp0oXHjxrRs2ZKxY8cyffp0du/eTUlJ\nCSaTiYMHD1Z6t1VpaSmlpaU0atQIrVbLtm3b2LVrV7kySik+/vhjSktL2b9/P1u3bmXIkCF3rPNW\nOp2Oa9eu0aVLF4vK302tTxiW3Pr26zKVfe7fvz9PP/00r7zyChEREYwYMeKOt8U1aNCAadOmMX78\neCIjI0lKSiI+Pp6HHnqICRMmMGDAAJydnXn99dcrXD4lJYXJkyfTtWtXxo8fz2OPPUb37t0B+NOf\n/sTmzZvp3r07CQkJ9O/f/7628Vbx8fGcPXuWyMhIXnjhhbuW79SpE3PmzGH27NlERkYyaNCg2+44\n+YWrqytffvkla9eupU+fPvTp04f33nuv3F01ltZ7L9sUEBDAvHnzmD9/PtHR0cTGxvLll1+aDxYq\nW7ai7/Ho0aM8/PDDhIeH8/zzz/OnP/2JoKCg25Zt2bIl7777LnPmzCE6OpqtW7fy2WefYW9vf9f1\nws3TJp07d2b27NkcOHCALl26mB86dXBwYN68eaxYsYLIyEiWL1/OvHnzzHWPGzeO2NhYRowYwYgR\nI4iNjeWRRx6xaNmKDBs2jJ9++okhQ4aUS46///3vad68OY888ggRERFMmTKFlJQUADp37szcuXOZ\nO3cu3bp1Y9KkSebz9ZMmTWL9+vVERUXx9ttv4+npyWeffcbChQvp0aMHCxcuZP78+eZTRr9uK5PJ\nxL///W/69u1Ljx492LdvH2+++WaFsXt7e9OjRw82bdoE3Lw7asyYMfj7++Pt7W3+99hjj7F69WpM\nJhOTJ0/GwcGBXr16MXPmTIYPH16uzhdffJEZM2YQGRnJ+vXriY6O5uWXX+bFF1+kT58+pKWl8f77\n75vL//nPf2bixIm88847REVFERMTw8cff8yHH35oPhX36210dXXlT3/6Ey+//DKRkZGsXbv2tluD\nfX198fDwoE+fPsyYMYPZs2fTokWLO36Pt1q9ejWjRo3CwcHBovJ3o1H3cvh9j2bNmsXWrVvx9va+\n48WmxMRE3nnnHQwGA40aNWLx4sXWCkcIUYedO3eOP/7xjyxdutTWoVSZvXv3MmPGDLZu3XrPy+r1\nekaOHMlXX32Fl5dXlcRj1YSxf/9+XF1dmTFjRoUJo6CggHHjxvHll1/i7+9Pbm5ulW2YEELUdveT\nMKzBqqekIiIiKr06v3r1agYOHIi/vz+AJAshhKjBbHoNIyUlhWvXrjFx4kTGjBljvhglhBDi5s
0L\nNaV3ATa+rdZoNHLixAn+7//+j+vXrzNu3Di6du1K8+bNbRmWEEKICtg0Yfj7+9OoUSOcnJxwcnIi\nIiKC5OTkuyaMXx6mE0IIUX2snjAqu6YeFxfHW2+9hdFoRK/Xk5SUxBNPPHHXOjUaDdnZBXctVx/4\n+rpJW/xM2qKMtEUZaYsyvr5udy9UCasmjFdffZXExETy8vLo168fL774IqWlpWg0GsaOHUtwcDC9\ne/dmxIgRaLVaHnnkkUqHghZCCGE7Vr2t1prkiOEmOXoqI21RRtqijLRFmfvtYdT6J72FEEJUD0kY\nQgghLCIJQwghhEUkYQghhLCIJAwhhBAWkYQhhBB1VFXfBCsJQwgh6qD07EJe+HAHe05kVVmdkjCE\nEKIOWrnjAsUlBlwbVM3Lk0AShhBC1GqLN5zi201nyp1+StUVcOB0Nq0au9OpZdW9NsKmgw8KIYT4\n7c6mX2PLoXQA/Bs5ExveBIBVOy8AMLJ3yyodqFUShhBC1FIb9qYC4Oig5ZtNZ2gR6I5Wo+HQmRyC\ng9zpWIW9C5BTUkIIUStdvnqdg6eyaR7gxgujQjEaFZ+uPMaSLWcBGNmnVZW/BkJ6GEIIUcMopZi3\n8hhXC0ro0MKLTi29aNXYHXu7smP8jfsuoYDBkc3o1MqbB3u2YPXuFHKu3aBNEw86NG9U5XFJwhBC\niBrmXEY+B05lA3A+I581u1Nwc3HgyWHt6RzsQ2FxKTuTMvF2dyIixBeAh3q35Gz6NU5evGqV3gVI\nwhBCiBpn19FMAJ4b2Qk7Ow3HLuSy40gmHy5NYlh0c+y0GvQGEwO6N8NOe7PXodVqeGlMZzKuFNEy\n0N0qcUnCEEIIG7mYVcD5jGv06xpk7hHoS43sPamjkZsT4W190Wo1dG3jS9/Ojfl05TESfroIgLOT\nPX06B5arz8nRzmrJAuSitxBC2IS+1Mi/Vhxl8cbTJJ7UmacfOpNDcYmRnp0C0GrLTis1D3Djz5O7\nE9725imouG5BODtV7zG/9DCEEMIGNuxNJefaDQCWbD5Ll2AfnJ3szaejenYKuG0Zlwb2PD+qE5cu\nF9LEt2G1xgvSwxBCiGqXm3+DhD0XcXdxYGD3puQV6lm9K4WrBSUcT8klOMidQG/XCpfVaDQ083cr\n1/uoLlZNGLNmzaJnz54MHz680nJJSUl06NCBjRs3WjMcIYSoEZZtO4e+1MSYmGBG922Fj0cDfth/\nieXbzqEU9AoNvHslNmDVhDF69GgWLlxYaRmTycR7771Hnz59rBmKEELUCGfTrrHnuI7mAW706hyI\no4Md4/u3wWhS7DqWhb2dlsgQP1uHWSGrJoyIiAjc3Su/Yr948WIGDRqEl1fVPsIuhBA1jUkpvtl0\nGoBH+7dB+/OdUWGtfegc7A1AeFsfXKpwhNmqZNNrGDqdjh9//JHx48fbMgwhhKgWe45ncSGzgMj2\nfrRp4mmertFomDCgLV2CvRkW3cJ2Ad6FTe+Smjt3Lq+99pr5/uOqfjuUEELUFCWlRr7bdh4Hey0P\n92t923wfT2defriLDSKznE0TxrFjx3jllVdQSnH16lW2b9+Ovb09cXFxd13W19etGiKsHaQtykhb\nlJG2KFMT2uLbH05xtaCEh+PaENLa19bh/CZWTxiV9Ro2bdpk/v/MmTOJjY21KFkAZGcX3HdsdYGv\nr5u0xc+kLcpIW5SpCW1xtaCEpZtO4+7iQL/OgTaL534Tp1UTxquvvkpiYiJ5eXn069ePF198kdLS\nUjQaDWPHjrXmqoUQosZYseM8+lIT4+PaVPvT2VXJqpG/9957Fpd95513rBiJEEJUv5JSI9sOpbMr\nKZMmvq706dzY1iHdl9qb6oQQwoYMRhMnUq4S0swTRwe7cvOKSwxsOZTOhr2pFFwvxdFBy4SB7Wzy\ndHZVkoQhhBC/wfJt51m/N5UALxeeHNae4CAPlFLsP5XNf388zbVCPc5OdjzYszkDIpri5uJo65Dv\nmyQMIYS4R5lXivhh/yWcnezR5V5n7lcH6N+tKZm5RRw7n4u9nZbhPVswKLJpjX0I77eQhCGEEPfo\n201nMZoUU4aG4ObiyJcJJ/lh/yUAOrb0YsLAtvg3crFxlFVPEoYQQtyDI2dzOHr+Cu2bNyK8rS8a\njYb/NyWSH/ZfIsDLhW7tfK3yetSaQBKGEEJYyGA08e2mM2g1Gsb3b2NODE6OdjzYs4Vtg6sG8j4M\nIYSwgMFoYvm28+iuFhPbNcgmLzCyNelhCCFEJZRSHDqTw9Kt59DlXsfD1ZGH+rS0dVg2IQlDCCF+\n5WpBCeczrnE+I58TF69yMasArUZDv65BPNS7JQ2d686dT/dCEoYQQvys1GDi281n2HIw3TxNA3Rt\n40N8v+A7vja1vpCEIYSol86k5eHsaE9jX1e0Gg05ecXMW3mMlKwCGvu4Et3Rn1aB7rQIdK/V4z9V\nJWkFIUS9s+NIBv9elwxAQ2cH2jTx4PSlPIpuGOjVKYAJg9rh9KvhPoQkDCFEPXMq9Sr/2XAK1wb2\ndGntw6nUqxw6k4O9nZbJQ0Lo0zmwzj5Hcb8kYQgh6g3d1et8svwoAM+PCiWkeSOUUuRcu4GTgx3u\nrrV/vCdrkoQhhKgXim6U8tHSJIpuGJg8JISQ5o2Am+/T9vV0tnF0tYM8uCeEqPOMJhOfrTxGVu51\nBkc2o2+X2v1eCluRhCGEqPOWbjnH8ZSrdA72Jr5fsK3DqbUkYQgh6rQf915k475LBHq78MyIjrX+\nJUa2JAlDCFGrXS0oYfPBNIpulN4270xaHv9aloRrA3teiu8sz1PcJ6u23qxZs9i6dSve3t6sXr36\ntvmrV6/m888/R6PR4OLiwl/+8hfatWtnzZCEEHVIid7I+0sOk55dxIrt5xkW3YK4bkHorhazds9F\nEk/o0Gg0TBvZqU6+n6K6aZRSylqV79+/H1dXV2bMmFFhwjh8+DDBwcG4ubmxfft2PvnkE5YsWWJR\n3dnZBVUdbq3k6+smbfEzaYsy9aEtlFIsWH2CxBM62jdvxMWsAq6XGGjo7EBh8c3eRhNfV6aM6EQL\n3/o9pMcvfH3d7mt5q/YwIiIiSE9Pv+P8sLCwcv/X6XTWDEcIUYf8eCCNxBM6goPceeWRLpSUGkn4\n6SJbDqUTHOTOsOgWdAn2xs/Pvc4nz+pSY07oLV26lL59+9o6DCFELXD6Uh5LNp/F3cWB50aGYm+n\nxd5OyyOxrXkktrWtw6uzakTC2LNnD8uXL+e///2vxcvcb9eqLpG2KCNtUaautsW5tDzmrTyGAv44\nOZK2rXzuukxdbYvqZvOEkZyczJ///Ge++OILPDw8LF5Oupg31Ydz1ZaStihTV9vibPo1PlhyhBsl\nBh4fEkKAu9Ndt7OutsVvUaOvYcDNC1N3kpGRwUsvvcTf//53mjVrZu1QhBC1WPLFq3y0LIlSg4mn\nhncgumOArUOqd6yaMF599VUSExPJy8ujX79+vPjii
5SWlqLRaBg7dizz5s3j2rVr/L//9/9QSmFv\nb8+yZcusGZIQopZJyy7kx/1p7D6WiVLw7MhOdGvna+uw6iWr3lZrTdLFvEm622WkLcrU9rYwmkwk\nnb3CjwfSOHnxKgA+Hg2YNLgdnVp631Ndtb0tqlKNPyUlhBCWulpQwq6jmWw9nE5ufgkA7Zs3on+3\nJnRp7SPDetiYJAwhhM2YTIp9yZc5kZLL6Ut56K4WA+DkYEds1yBiuwbRxK+hjaMUv5CEIYSwCX2p\nkc9Xn+DA6WwAGjjaEdrKm87B3vTsFCDjPtVA8o0IIapdwXU9//zuKGfTrxHSzJNHHmhNMz83OeVU\nw0nCEEJUq8t5xXyw5Ai63Ov06ODPE0Pb42AvA2fXBpIwhBDVJj2niH98e4hrhXqG9mjO6JhWaDXS\nq6gtJGEIIapFqq6Af3x7mMLiUsY90JqBkfKwbm0jCUMIYTUmk6KguJSLWQUs+P44xSUGHh/cjpiw\nIFuHJn4DSRhCiCp3IiWX/2w4RU7eDUw/Pxus1WhkSI9aThKGEKJKHTiVzfzvjwHQqrE7Hg0d8XR1\nIrydL+2bN7JxdOJ+SMIQQlRqx5EMDp3JoW9YYzoHe1d6kXrHkQwWrU/G0d6OF8eE0qGFVzVGKqxN\nEoYQ4o5+EQoNAAAgAElEQVRKDSaWbj1HYXEph8/mEOjtQv9uTWjo4oi+1Ii+1Ej+9VKuFtzgyrUb\nHE+5imsDe155JIxWjd1tHb6oYpIwhBB3dOhMNoXFpUS298PeTkviCR2LN56+Y3n/Rs68MDqUIF8Z\nzqMukoQhhLijHUmZAIzo1ZLGPq6M7tuKg6ez0Wg0ONprcXDQ4ubiiJebE54NnWQ4jzpOvl0hRIVy\n8oo5cSGX1kEeNPZxBcDLvQH9I5raODJhK/I8vhCiQjuPZqKAPl0CbR2KqCEkYQghbmMyKXYezaSB\nox3dQ/xsHY6oISRhCCFuczwll9z8EiLb+9PAUc5ci5skYQghbrP9SAYAfbs0tnEkoiaxasKYNWsW\nPXv2ZPjw4Xcs89ZbbzFw4EAeeughTp48ac1whBB3kXmliM9WHePgqWyCfF1pGXh/74AWdYtV+5qj\nR49m4sSJzJgxo8L527ZtIzU1lY0bN3LkyBHefPNNlixZYs2QhKi3SvRGPlt1DEcHO8Ja+xAa7I1r\nA3uuXLtBSlYBh87ksOdEFkpBc383Jg8JQSNDj4tbWDVhREREkJ6efsf5mzZtYuTIkQB06dKFgoIC\ncnJy8PHxsWZYQtRLq3encOTcFQD2JV9Gq9Hg7GRH0Q2DuUwTX1ce6t2K8LY+kizEbWx6Nevy5csE\nBJSNXOnv749Op5OEIUQVy7xSxIa9qXi7N+D50Z04fiGXw2dzKLheSocWXrQIcKNloDttm3nKC43E\nHdk0Yaifhz2+laVHNb6+cm71F9IWZaQtyvzSFkopPvouCaNJMW1MZ7p3CqR7aP16H4XsF1XDpgnD\n39+frKws8+esrCz8/Cy75zs7u8BaYdUqvr5u0hY/k7Yoc2tb7D2p48iZHDoHe9PKz7XetZHsF2Xu\nN3Fa/bbainoRv4iLi2PlypUAHD58GHd3dzkdJUQVKiwu5dtNZ7C30/Jo/zZyXULcF6v2MF599VUS\nExPJy8ujX79+vPjii5SWlqLRaBg7diwxMTFs27aNAQMG4OzszDvvvGPNcISos0r0Rq6XGCg1mjAY\nTBy9mMeWfakcu3AFg1ExolcL/Bq52DpMUctpVGVdgBpMupg3SXe7TH1rC6UUpy/lseVQOgdOZWM0\n3f5TbuLrSlQHfwZFNsPern4+p1vf9ovK3O8pKXnmX4ha6PSlPBZvOEV6ThEAgd4uNPN3w95Og4Od\nlqaBHoQ0cSfQ29XGkYq6RBKGELXMoTPZfLryOCaTIrK9H7Fdg2jb1LPc9Qk5qhbWIAlDiFpk19FM\n/r02GXt7DS+N6UynVt62DknUI5IwhKihDEYTmw+kkX3tBiaT4nqJgcQTOlwb2PO7h7sQHORh6xBF\nPSMJQ4ga6rtt59iw91K5aY3cnHjlkS40kXdmCxuQhCFEDXToTDYb9l7C38uFaSM6Ym+vxU6rwdvd\nCQd7O1uHJ+qpuyaMS5cusWzZMhITE8nKysLJyYmQkBAGDRrEwIEDsbeXnCNEVcrJK2bhmpM42Gt5\nbmQnmvpJb0LUDJX+tf/zn//M8ePHGTx4ML///e/x8fGhpKSEc+fOsXPnThYsWMBf/vIXwsLCqite\nIeo0g9HEp6uOc73EwOQhIZIsRI1SacKIi4tj9uzZt01v164dQ4cOJS8vj0uXLlWwpBDiXhmMJhYm\nnORCZj7RHQPo0znQ1iEJUU6lCSMmJqbShT09PfH09KzSgISoj0oNRj5deZzDZ3MIDnJn4qC2Mu6T\nqHEsGivgr3/9KwUFBRgMBh599FHCwsJYtWqVtWMTol64oTfw4dIkDp/NoUOLRvx+bFcaOMq1QVHz\nWJQwdu/ejZubGzt37sTf358NGzbw5ZdfWjs2Ieq8VF0Bf/3qICcvXqVrGx9eju+Mk6PcBSVqpns6\njNm3bx8DBgzA399fustC3IdSg5Hvd6Wwbk8qJqWICWvMhIFtsdPWzwECRe1gUcLw9vbm9ddfZ9eu\nXUydOhWDwYDRaLR2bELUOfpSI3tO6Fi35yK6q8V4uzfg8cHtZIgPUStYlDDee+89vv/+e+Lj4/Hw\n8CAtLY0nnnjC2rEJUWcUFpeycV8qWw9lUFhcilajoX9EE0b3bSXXK0StYdGe6uXlxeTJk82fmzRp\nQpMmTawVkxA12rXCErRaDW4ujhaV1129zgdLjnD5ajGuDewZFt2c2K5BeLk3sHKkQlStShPGc889\nx7Rp0+jcufNt8woLC/nuu+9o0KABY8eOtVqAQtQkJXojbyzcyw29gZ6dAhkU2bTSd06cS7/GR8uS\nKCwuZUiPZozo1RInB7moLWqnShPGSy+9xHvvvUdKSgqdO3fG29ubkpISzp8/T3p6OuPGjWP8+PHV\nFasQNrc3WUdhcSmODlq2H8lgx5EMQoO96djSi3ZNPWni2xC9wYgut5jzGdf43+azlBpNTBrcjn5h\nQbYOX4j7UmnCCAkJ4fPPPyczM5O9e/ei0+lwcnJi8ODBdOvWDUdHy7rkQtQVO45kogFmPxlFalYB\n6xIvknTuCknnrgDgaK9FbzCZyzs6aHlpTGe6tPaxUcRCVB2LrmEEBgby0EMP/aYVbN++nblz56KU\nYsyYMUydOrXc/MzMTP7whz9QUFCAyWRi+vTpd33CXAhbSM8p4mz6NTq29MLP0xk/T2e6tfMl+9oN\nzlzK49SlPFIyC3B3dcDfy4WARi6EBnsT4OVi69CFqBIWJYwrV67wzjvvkJmZyddff01ycjKHDh26\n6+kok8nEnDlzWLRoEX5+fsTHxxMXF0dwcLC5zKeffsrQoUMZN24c586d4+mnn2bz5s33t1VCWMGO\nIxkA9O3S
2DxNo9GYk0evUBn7SdRtFj0l9Prrr9OtWzfy8/MBaNWqFf/973/vulxSUhLNmzcnKCgI\nBwcHhg0bxqZNm8qV0Wg0FBYWApCfn4+/v/+9boMQVldqMLH7WBYNnR3o2kZOL4n6yaKEodPpGD9+\nPHZ2N+/ucHR0RGvBE6k6nY7AwLKjLn9/fy5fvlyuzAsvvMCqVauIiYlh2rRpvPHGG/cSvxDV4vDZ\nHAqLS+kVGoC9nTyNLeoni05J/folSfn5+Sil7rqcJWUSEhIYM2YMkydP5vDhw7z22mskJCTcdTlf\nX7e7lqkvpC3KWKst9iw/CsBD/drUmvauLXFWB2mLqmFRwhg4cCB//vOfKSoqYvny5fz3v/9lzJgx\nd10uICCAjIwM82edToefn1+5MsuWLWPhwoUAhIWFUVJSQm5uLl5eXpXWnZ1dYEnodZ6vr5u0xc+s\n1RapugIOn86mdRMPGmhrx74n+0UZaYsy95s4LepbP/XUU0RERNCxY0e2bdvGxIkTefzxx++6XGho\nKKmpqaSnp6PX60lISCAuLq5cmcaNG7N7924Azp07h16vv2uyEKI6KKXYcjCNtxcfQAEDIpraOiQh\nbEqjLDlvdB+2b9/O22+/jVKK+Ph4pk6dyscff0xoaCixsbGcO3eO119/nevXr6PVapkxYwbR0dF3\nrVeOGG6So6cyVdkW+UV6Fq1L5vDZHFwb2DN5SHu6tfOtkrqrg+wXZaQtytxvD8OihHHlyhW++uor\nUlNTMRgM5ukfffTRfa38fsgOcJP8GMpURVvkX9ezYW8qmw+kU1JqJKSZJ08P70gjN6cqirJ6yH5R\nRtqizP0mDIuuYTz33HN06NCB6Oho851SQtQl6dmF7EjKZNvhDEpKjXg0dCS+XzCxXYPQauXdL0KA\nhQmjuLiYN99809qxCFGtSkqNbD+cwe5jWVzU3TwC9WjoyJiYVsSENcbBXg6OhLiVRQmjS5cunDp1\ninbt2lk7HiGqRfLFqyxal8zlvGLstBq6BHvTMzSQsNbekiiEuAOLEsa4ceOYMGECAQEBODmVnctd\ntmyZ1QIToqqUGkwUXNejN5go0RvZejidbYcz0GhgUGRThkQ1x91VBtIU4m4sShivvfYa06ZNo0OH\nDnINQ9Qql/OK+etXB8gr1Jeb3sTXlSeGtqdloLuNIhOi9rEoYTg5OfHkk09aOxYhqpS+1Mi85UfJ\nK9TTtY0Pbi4OONrb4e/lQkxYYxniQ4h7ZFHC6NOnD9u3b6dv377WjkeIKqGUYvHGU6ReLqRvl8ZM\nHhJi65CEqPUsShhLlixhwYIFuLq64ujoiFIKjUbDTz/9ZO34hPhNth/JYNfRLJoHuPHYgDa2DkeI\nOsGihPHdd99ZOw4hqkROXjE/ndCxetcFXBvY8/yoTnLXkxBVxKKEERQk7yIWNVvSuRx+WHKE4+d/\nflWqg5ZnHuqIj4ezjSMTou6oNGG89tprvPvuu4wZMwaN5vanXeW2WmFrJqX4fucFvt+VAkBIM0+i\nOwbQrZ0fLg0sOh4SQlio0l/ULy87+sMf/lAtwQhxL4pLDCxMOMnB09n4eDTgjSd74OYodz4JYS2V\nJoxfXskaGRlZLcEIYYkSvZHEkzrWJ6aSlXudkGaePDuyE62CPGSQOSGsSPrsotbIKywh4aeL7D6W\nSXGJEY0G+ndrwiMPtJZnKoSoBpUmjNOnT1f4bgq5rVZUtwOnsvm/9ckUFpfi2dCRARFN6dulMV7u\nDWwdmhD1RqUJo0WLFixYsKC6YhHiNsUlBr758Qw7j2biYK/lsQFt5SltIWyk0oTh6Ogot9QKmzAY\nTWw7nMGa3SlcK9LT3N+NqSM6EOjtauvQhKi3Kk0YDg4O1RWHEMDN22R/OpbFqp0XyLl2AycHOx7q\n3ZJh0c2lVyGEjVWaMJYsWVJdcQhBenYh/7fhFGfTrmFvp2Vg96YM7SFDjwtRU1j9Lqnt27czd+5c\nlFKMGTOGqVOn3lZm7dq1/Otf/0Kr1dKuXTv+8Y9/WDssUYPkF+n5Yf8l1iemYjQpurXzZdwDbfD2\nkAvaQtQkVk0YJpOJOXPmsGjRIvz8/IiPjycuLo7g4GBzmYsXL/LFF1/wv//9j4YNG5Kbm2vNkISN\nGIwmVu9KQW8w4uxkj4uTPVfyb3Ai5SqXLhcC4O3uxGMD2hHWxsfG0QohKmLVhJGUlETz5s3NF86H\nDRvGpk2byiWMJUuW8Oijj9KwYUMAvLy8rBmSsJENe1NZvTvltun2dlo6tGhEp5be9OvamAaO8miQ\nEDWVVX+dOp2OwMBA82d/f3+OHj1arkxKSgoA48ePRynF888/T58+fawZlqhmV67dYPWuFNxdHHhh\ndGdKDEau3zDg2sCe1kEeODrIaLJC1AZWTRhKqbuWMRqNpKam8vXXX5ORkcFjjz1GQkKCucdxJ76+\nblUVZq1X09vi84ST6A0mnn+4C9Fdm1h1XTW9LaqTtEUZaYuqYdWEERAQQEZGhvmzTqfDz8+vXBl/\nf3+6du2KVqulSZMmtGzZkpSUFDp16lRp3TJm0E2+vm41ui2Onr/CT0czadPEg07NPK0aa01vi+ok\nbVFG2qLM/SZOq97YHhoaSmpqKunp6ej1ehISEoiLiytXpn///uzZsweA3NxcLl68SNOmTa0Zlqgm\npQYTX/9wGq1Gw4SB7SocIl8IUXtYtYdhZ2fHG2+8wZQpU1BKER8fT3BwMB9//DGhoaHExsbSp08f\ndu3axbBhw7Czs2PGjBl4eHhYMyxhRUopUnWFHDydzcHT2Vy+Wkz/iCY09av8FKMQoubTKEsuNNRA\n0sW8qSZ1t3OuFfOvFce4mHUzHns7LWGtvXliaHucnax/91NNagtbk7YoI21R5n5PSck9jKJKnEnL\n41/Lj5J/vZSubXyI7hhAp1ZecpusEHWI/JrFfTEpxc6kTBZvOIVS8NiAtjwQHiTXK4SogyRhiHtm\nMilOpl7l4Kmb1ymuFelxcbLn2VGd6NhCHrwUoq6ShCHuSeaVIr5Yc5ILmTdf39vQ2YHenQMZFt0c\n/0YuNo5OCGFNkjCERUxKselAGsu2nqPUYKJ7iB+xXYNo09QDO60MOy5EfSAJQ9xVzrVivkw4SXJq\nHg2dHXj6wQ5EhPjdfUEhRJ0iCUPckVKKHUmZfLvpDDf0RsJa+/D44HZ4NHSydWhCCBuQhFHPGYwm\ntBoNWm35u5oycopYsuUsSeeu4Oxkx5PD2tOzU4Dc/SREPSYJox7LL9Lz1n/2U1JqJCLEj6j2/rg2\nsGf17hT2nbyMAjq2aMQTQ9vj5S4vMxKivpOEUU+ZlOLzNSduvjfb0Y4tB9PZcjDdPL+ZX0OG92pJ\neFsf6VUIIQBJGPXW2p8ucvxCLqGtvHlxTCinUvNIPKEjr6iE2K5BhLWWRCGEKE8SRj10+lIeK3ac\np5GbE0892B57Oy0dW3rRsaU8dCeEuDNJGPWIUoqz6df4bNUxNGh4Z
kRH3FwcbR2WEKKWkIRRD5Qa\njOw/lc0P+y6R8vNIsg/3C6ZtU08bRyaEqE0kYdRRpy/lcfB0NmfTr3ExqwCjSaEBurbxYWD3prRr\n1sjWIQohahlJGHXQ0fNX+HDJERRgp9XQzL8hIc0aEdM1CD9PZ1uHJ4SopSRh1DGX84pZ8P1x7Oy0\nPPtQRzq09MLJwc7WYQkh6gBJGHVIid7IJ98dpeiGgSeGhNC1ra+tQxJC1CEyzGgdoZRi0fpk0rIL\n6dc1iD5dGts6JCFEHWP1hLF9+3YGDx7MoEGDWLBgwR3LrV+/npCQEI4fP27tkOqU4hIDWw+n8+aX\n+0g8oSO4sTuP9m9j67CEEHWQVU9JmUwm5syZw6JFi/Dz8yM+Pp64uDiCg4PLlSsqKuKrr74iLCzM\nmuHUKSaTImHPRTbsTeX6DQN2Wg0RIX481r8N9nbScRRCVD2rJoykpCSaN29OUFAQAMOGDWPTpk23\nJYyPPvqIp59+mi+++MKa4dQZeYUlLPj+OMmpeXi6OdG/WxNiwoJo5CbDjgshrMeqh6I6nY7AwEDz\nZ39/fy5fvlyuzMmTJ8nKyiImJsaaodQZxy5c4S9f7iU5NY+w1j7Mm/EAI/u0kmQhhLA6q/YwlFJ3\nnT937lz+9re/WbzML3x93e4rttomJTOfxWtPsvdEFvZ2Gp5+qBPD+7RCo9HI8B63qG/7RWWkLcpI\nW1QNqyaMgIAAMjIyzJ91Oh1+fmWv9iwqKuLs2bNMnDgRpRQ5OTk899xzfPrpp3Ts2LHSurOzC6wW\nd02Sqitgw95U9hzXoYC2TTwY178NLQLcyckpxNfXrd60xd1IW5SRtigjbVHmfhOnVRNGaGgoqamp\npKen4+vrS0JCAu+//755fsOGDfnpp5/MnydOnMjMmTPp0KGDNcOq8fSlRvaevMzWw+mcz8gHbr6f\nYnRMMKGtvGTYcSGETVg1YdjZ2fHGG28wZcoUlFLEx8cTHBzMxx9/TGhoKLGxseXKazQai09J1UUm\nk2LXsUxWbD9PXqEeDdA52JuYsMZ0ae2DVhKFEMKGNKqW/oWua13Mkym5fLv5LJcuF+JoryUuogmx\nXYPw8ah87CfpbpeRtigjbVFG2qJMjT4lJe5OX2pkyZazbD6Yjgbo1SmAUX1byTu0hRA1jiQMG0q7\nXMj874+TnlNEkI8rTz7YnhYB7rYOSwghKiQJwwZMSrHpQBpLt5zDYDTxQHgQj8S2xlFGlRVC1GCS\nMKpZbv4Nvlx7khMpV2no7MATQzvStY2MKiuEqPkkYVQTfamR3ceyWLb1HNdLDHQO9uaJISF4NJQn\ntIUQtYMkDCvLyStm86F0dhzJoOiGAUcHLZMGtyOmS2N5nkIIUatIwrASk0mxLvEiK3dcwGhSuLk4\n8GDP5vQLC5I7oIQQtZIkDCu4cu0GX6w5walLeXg0dCQ+JpjI9n442MtFbSFE7SUJowoppdhzQsdX\nG09TXGIgvK0vjw9uJ4MDCiHqBEkYVSS/SM9/Npzi4OlsnBzsmDwkhD6dA+U6hRCizpCEcZ+UUuxL\nvsxXG09TWFxK26aeTBnWHj/Pyof0EEKI2kYSxn1Iu1zIf388TXJqHg72WsbFtaF/RBMZJFAIUSdJ\nwvgNCotLWbnjPFsOpaMUdAn2ZlxcG/y9XGwdmhBCWI0kjHtgMim2HU5n+fbzFN0wEODlwvj+bQht\n5W3r0IQQwuokYVjoVOpV/vvjGS5dLqSBox2PxLamf0QT7O2s+lp0IYSoMSRh3EVu/g2WbDnL3pOX\nAegdGsiYmFYypIcQot6RhHEHJXojG/elkrDnIvpSEy0D3Xh0QFuCG3vYOjQhhLAJSRi/YjCa2JmU\nyaqdF7hWpMfNxYHH+relV+dAuftJCFGvWT1hbN++nblz56KUYsyYMUydOrXc/EWLFrF06VLs7e3x\n8vJi7ty5BAYGWjus2xiMJhJP6Fjz00V0uddxdNDyYM8WDI5shksDyatCCGHVv4Qmk4k5c+awaNEi\n/Pz8iI+PJy4ujuDgYHOZDh06sHz5cpycnPjmm2/4+9//zgcffGDNsMopNZjYmZTBusRUcq7dwE6r\noV/XIEb0aoGnXKcQQggzqyaMpKQkmjdvTlBQEADDhg1j06ZN5RJGZGSk+f9hYWGsXr3amiGVk3ml\niM9WHefS5UIc7LXEhTdhcFQzvD1kNFkhhPg1qyYMnU5X7vSSv78/R48evWP5ZcuW0bdvX2uGZLbr\naCZfbTxNSamRPp0DGd1X7nwSQojKWDVhKKUsLrtq1SqOHz/O4sWLqzyOkxev8p/1yZQaTTg72qPR\naEjLLsTZyY5pD3Uksr1/la9TCCHqGqsmjICAADIyMsyfdTodfn5+t5XbvXs3CxYs4KuvvsLBwcGi\nun193Swqd+R0Nh8tS8JkMuHl4Uz+dT3Xbxjo2Mqb343rSoC3q2UbU4NZ2hb1gbRFGWmLMtIWVcOq\nCSM0NJTU1FTS09Px9fUlISGB999/v1yZEydO8Oabb7Jw4UIaNWpkcd3Z2QV3LXPswhX++d1RlFK8\nMDqUzsE+wM2ej0ajAZPJonpqMl9ft1q/DVVF2qKMtEUZaYsy95s4rZow7OzseOONN5gyZQpKKeLj\n4wkODubjjz8mNDSU2NhY3n33XYqLi3n55ZdRStG4cWPmzZt33+s+fCaHeSuPAfDimM7lxnuSd1QI\nIcS906h7udBQg2RnF1BSamRnUiYdWjQi8OdTS0op1iemsmzrOezttbw4JpROLevu4IBy9FRG2qKM\ntEUZaYsyNbqHYW3fbTvHj/vT0AARIX4MimzGjwcusee4jkZuTrwwOpSWge62DlMIIeqEWpswUnUF\nbDqQho9HA1ydHdiXfJl9yTcHCAxu7M7zo0PlwTshhKhCtTJhmEyKxRtPoRRMGtyOji28OHYhlw17\nU/Fv5MK4uDY42Muw40IIUZVqZcL4cV8q59LziQjxM1+fCG3lLS8yEkIIK6qVh+GL1pzAydGO8XFt\nbB2KEELUG7UyYRRc1zOyd0sauck1CiGEqC61MmE82Lslcd2a2DoMIYSoV2plwnhmVGd5l7YQQlQz\n+asrhBDCIpIwhBBCWEQShhBCCItIwhBCCGERSRhCCCEsIglDCCGERSRhCCGEsIgkDCGEEBaRhCGE\nEMIikjCEEEJYRBKGEEIIi1g9YWzfvp3BgwczaNAgFixYcNt8vV7PK6+8wsCBAxk7diwZGRnWDkkI\nIcRvYNWEYTKZmDNnDgsXLmTNmjUkJCRw7ty5cmWWLVuGh4cHGzdu5PHHH+fdd9+1ZkhCCCF+I6sm\njKSkJJo3b05QUBAODg4MGzaMTZs2lSuzadMmRo0aBcCgQYP46aefrBmSEEKI38iqCUOn0xEYGGj+\n7O/vz+XLl8uVuXz5MgEBAQDY2dnh7u5OXl6eNcMSQgjxG1g1YSil7rmMUgqNRmOtkIQQQvxG9tas\nPCAgoNxFbJ1Oh5+f321l
srKy8Pf3x2g0UlhYiIeHx13r9vV1q/J4aytpizLSFmWkLcpIW1QNq/Yw\nQkNDSU1NJT09Hb1eT0JCAnFxceXKxMbGsmLFCgDWr19Pjx49rBmSEEKI30ijLDlvdB+2b9/O22+/\njVKK+Ph4pk6dyscff0xoaCixsbHo9Xpee+01Tp48iaenJ++//z5Nmsj7uoUQoqaxesIQQghRN8iT\n3kIIISwiCUMIIYRFJGEIIYSwSK1LGHcbm6ouy8rKYtKkSQwdOpThw4fzn//8B4Br164xZcoUBg0a\nxJNPPklBQYGNI60eJpOJUaNGMW3aNADS0tJ45JFHGDRoENOnT8dgMNg4wupTUFDASy+9xJAhQxg2\nbBhHjhypl/vFokWLePDBBxk+fDivvvoqer2+Xu0Xs2bNomfPngwfPtw8rbL94K233mLgwIE89NBD\nnDx58q7116qEYcnYVHWZnZ0dM2fOZO3atXz77bd8/fXXnDt3jgULFhAdHc2GDRuIiopi/vz5tg61\nWvznP/8hODjY/Pkf//gHTzzxBBs2bMDNzY1ly5bZMLrq9fbbbxMTE8O6detYtWoVrVq1qnf7hU6n\nY/HixSxfvpzVq1djNBpJSEioV/vF6NGjWbhwYblpd9oPtm3bRmpqKhs3bmT27Nm8+eabd62/ViUM\nS8amqst8fX1p3749AK6urgQHB6PT6cqNxzVq1Ch+/PFHW4ZZLbKysti2bRsPP/ywedqePXsYNGgQ\ncLMdfvjhB1uFV60KCwvZv38/Y8aMAcDe3h43N7d6uV+YTCaKi4sxGAzcuHEDPz8/EhMT681+ERER\ngbu7e7lpv94PfvmbuWnTJkaOHAlAly5dKCgoICcnp9L6a1XCsGRsqvoiLS2N5ORkunTpwpUrV/Dx\n8QFuJpWrV6/aODrrmzt3LjNmzDAPI3P16lU8PDzQam/u0gEBAfVm30hLS6NRo0bMnDmTUaNG8cYb\nb1BcXFzv9gt/f3+eeOIJ+vXrR9++fXFzc6NDhw64u7vXy/3iF7m5ueX2g9zcXKD8OH5ws/10Ol2l\nddWqhCGPjNxUVFTESy+9xKxZs3B1da13Y29t3boVHx8f2rdvb94nlFK37R/1pV0MBgMnTpzg0Ucf\nZcWKFTg7O7NgwYJ6s/2/yM/PZ9OmTWzZsoUdO3ZQXFzM9u3bbytX39rlTir6e3q3trHqWFJVzZKx\nqcpAbb4AAAdwSURBVOo6g8HASy+9xEMPPUT//v0B8Pb2JicnBx8fH7Kzs/Hy8rJxlNZ18OBBNm/e\nzLZt2ygpKaGoqIi5c+dSUFCAyWRCq9WSlZVVb/aNgIAAAgICCA0NBWDgwIF8/vnn9W6/2L17N02b\nNsXT0xOA/v37c+jQIfLz8+vlfvGLO+0H/v7+ZGVlmctZ0ja1qodhydhUdd2sWbNo3bo1jz/+uHna\nAw88wPLlywFYsWJFnW+T6dOns3XrVjZt2sT7779PVFQU//jHP4iKimL9+vVA/WiHX/j4+BAYGMiF\nCxeAm9dyWrduXe/2i8aNG3PkyBFKSkpQSrFnzx7atGlT7/aLX/cc7rQfxMXFsXLlSgAOHz6Mu7u7\n+dTVndS6oUEqGpuqvjhw4AATJkygbdu2aDQaNBoNr7zyCp07d+Z3v/sdmZmZNG7cmI8++ui2C191\n1d69e/nyyy/57LPPuHTpEtOnTyc/P5/27dvz7rvv4uDgYOsQq0VycjJ/+tOfMBgMNG3alHfeeQej\n0Vjv9otPPvmEhIQE7O3t6dChA2+99RZZWVn1Zr949dVXSUxMJC8vDx8fH1588UX69+/Pyy+/XOF+\nMHv2bHbs2IGzszPvvPMOHTt2rLT+WpcwhBBC2EatOiUlhBDCdiRhCCGEsIgkDCGEEBaRhCGEEMIi\nkjCEEEJYRBKGEEIIi0jCEDXaAw88wNmzZ6tlXZ988km5oa9nzpzJ119/fd/1zpw5k+HDhzN9+vT7\nrqsyycnJrFu3zqrrEPWbJAwhfvbJJ59QWlpapXXm5OTw/9u7v5AmuziA49/ln7S8KOvWoghaI8KL\nihkJWon0R/Y8S2NYOL1IEFqE3gjRRZZEBcPyJqE/lDSIyBp2UV4IEVgGXeyiDKMVFnSRltTmaPr4\ney/Eh3KL9vYG7+vb73O182znnN/DYL+dHfY7fX199Pb2EgwGf+vYcz1//vyXE8b09PRvjkb9H2nC\nUPPS69evOXjwIDU1NRiGYZc+AHA6nXR1dVFdXU1FRQV9fX32c/fv32fnzp14vV66urpwOp0kEgna\n2tpwOBz4fD5M0yQWiwEwPDyM3++nsrKS1tbWH8Zz584dqqqq8Hg8BAIBPn78SDwex+/38/XrV0zT\n5OrVq9/1CYfDHDp0yG5blkVpaaldL+3ixYvs27cPr9dLU1MTY2NjAExOTnL69GmqqqowDINAIMD4\n+DidnZ08fvwY0zRpb28HZiojmKaJx+OhoaGBt2/fAjP/kDcMg5MnT+Lz+Xj48OE/eTvUn0KU+g8r\nLy+Xly9ffndtampKTNOUaDQqIiKxWEwqKyvt9tq1a+X69esiIvL06VMpLS0VEZHR0VHZvHmzjIyM\niIjIlStXxOl0ysTEhN0vkUjY87S2tkptba0kk0lJJpOye/duGRgYSIlxeHhYtm7dKqOjoyIi0tHR\nIUeOHBERkXfv3onb7U57b4lEQtxut3z69ElERPr7+8Xv94uISDgclmPHjtmvDYVC0tLSIiIinZ2d\nEggEZGpqSkTE7t/T0yOHDx+2+4yNjYnb7ZZXr16JiMjNmzelpqZGREQGBwfF5XJJJBJJG5tS6egK\nQ807b968IRqN0tzcjGEY7N+/n8nJye9OX9y1axcAxcXFfPjwgWQySSQSYf369RQVFQFQXV2dMrbM\nqZSzY8cOcnJyyMnJweVyMTIyktJncHCQsrIyli1bBoDP52NgYOCn95GXl8f27du5e/cuMFMYbvYQ\npP7+fh49eoRhGBiGQSgU4v3798BMefe6ujqysrIA7Oqsc0UiEdatW8fq1asB2Lt3L0NDQ0xMTACw\ncuVKNmzY8NM4lZo1r8qbKwUzH+qFhYXcvn077fMOh4OFCxcC2AfnWJaVkgzmttPJzc21H2dlZaU9\nD1pEUs4RmJ33ZwzD4NSpU+zZs4cnT55w9uxZe8ympia8Xm/a+TKRLq5v24sWLcpoHKVm6QpDzTur\nVq0iLy+PcDhsX4tGo8TjcSD1A3W2XVxczLNnz+zf8b/d9wAoKCjgy5cvfzuekpISHjx4YO8x3Lhx\ngy1btqTMn87GjRuJxWIEg0EqKirsRLdt2zZCoRCfP38GIJlM8uLFCwDKy8u5du2avUE/e5JeQUGB\nvfcye79DQ0N22fOenh5cLpcmCvXLdIWh/tMcDgf19fVkZ2fb35h7e3u5cOEC7e3tXL58GcuyWL58\nOR0dHXafuWPAzEEyx48fp7GxkaVLl1JWVkZ2djb5+fkANDQ0UFdXR35+Pt3d3RnHu
GbNGpqbm6mv\nr2fBggUUFRXR1taWMv+PGIbB+fPnCYVC9jWPx8P4+DgHDhzA4XAwPT1NbW0tTqeTxsZGgsEghmGQ\nm5vLihUrOHfuHCUlJVy6dAnDMNi0aRNHjx7lzJkztLS0YFkWhYWF9gpGqV+h5c3VHyUej7N48WJg\n5hv3rVu3fst/LZT6E+gKQ/1Ruru7uXfvHpZlsWTJEk6cOPFvh6TUvKErDKWUUhnRTW+llFIZ0YSh\nlFIqI5owlFJKZUQThlJKqYxowlBKKZURTRhKKaUy8hf8CwfjbzhfpQAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ "\u003cmatplotlib.figure.Figure at 0x7f47b218dbd0\u003e"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.plot(ag_means)\n",
+ "plt.ylabel('Time(s)')\n",
+ "plt.xlabel('Length of vector')\n",
+ "_ = plt.title('Time to sum the elements of 1000 vectors (AutoGraph)')\n",
+ "_ = plt.ylim(ymin=0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "d7IAJ6Bwbk9t"
+ },
+ "source": [
+ "## Eager"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "XMu5-12yoOzY"
+ },
+ "outputs": [],
+ "source": [
+ "from tensorflow.python.eager import context"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "_vt9MzpyjQ4T"
+ },
+ "outputs": [],
+ "source": [
+ "# Sum written using for loop and run with tf.eager\n",
+ "def sum_all(elements):\n",
+ " sum_ = 0.0\n",
+ " length = elements.shape[0]\n",
+ " for i in tf.range(length): \n",
+ " sum_ += elements[i][0]\n",
+ " return sum_\n",
+ "\n",
+ "eager_means = []\n",
+ "for num in range(max_elements):\n",
+ " with context.eager_mode():\n",
+ " durations = []\n",
+ " for i in range(trials + burn_ins):\n",
+ " \n",
+ " start = time.time()\n",
+ " for _ in range(batches):\n",
+ " run_trial(num)\n",
+ " \n",
+ " if i \u003c burn_ins:\n",
+ " continue\n",
+ " \n",
+ " duration = time.time() - start\n",
+ " durations.append(duration)\n",
+ " eager_means.append(np.mean(durations))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ },
+ "height": 301
+ },
+ "colab_type": "code",
+ "executionInfo": {
+ "elapsed": 422,
+ "status": "ok",
+ "timestamp": 1532460024499,
+ "user": {
+ "displayName": "",
+ "photoUrl": "",
+ "userId": ""
+ },
+ "user_tz": 240
+ },
+ "id": "5gHVdMlD-A-T",
+ "outputId": "3b581cb7-7ef9-489c-92f1-3e52c0c2dc8a"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEcCAYAAAAydkhNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XlclNX+wPHPsC+CILviCiruhAiiqaAimqnglpZLWpkt\ntmh50+rXvZZ2q9veq5ulZYtlaZp7mQuaorjklkoqiwgKsojsy8yc3x9cBxFUVGBYvu+/mGfmPM93\nzjzMd85zznOORimlEEIIISphYuwAhBBC1F2SJIQQQtyQJAkhhBA3JElCCCHEDUmSEEIIcUOSJIQQ\nQtyQJAlg8eLFvPrqq8YOo14ZOHAge/furfHjzJs3jw8//LDGj1NXxMfHExERQc+ePfnuu++MHU6D\ndvbsWcaMGWPsMCo1btw4YmNjjR0G0EiSxD333IOfnx9+fn506tSJHj16GLZt2LCBxx9/nNdff73G\n40hOTsbHxwe9Xl/jx6pOje2L+lZq8nNcsmQJgYGBHDp0iEmTJlV4fvPmzUyYMAFfX1+mTJlS4flT\np04xevRofH19GTNmDDExMeWef+eddwgMDKR379688847t1W2pk2ePJlVq1bV2vE++ugjHn30UcPj\ngQMH0qNHD/z8/AzfD2+88UatxXOtRx55pM78zzWKJHH48GH+/PNP/vzzT5o3b87ixYsN2+6///5a\ni0MphUajQe5frN9q8nO8cOEC3t7eN3zewcGBhx9+mBkzZlR4rqSkhKeeeorw8HAOHDhAeHg4Tz75\nJFqtFoAVK1awfft21q9fz7p164iMjOTHH3+sUtn64HY+j7S0NKKjoxk0aFC57YsXL+bPP/80fD+8\n8sor1R3mTel0OqA0YUVHR5Oenl6rx69Mo0gS11JKVTiZPvnkE1588UWg7Ffi6tWrCQ4OJjAwkBUr\nVnD8+HFGjhxJQEBAhVbHqlWruO+++wgMDOTRRx/lwoULlR578uTJAPj7++Pn58fRo0dRSvHpp58y\ncOBA+vbty0svvURubm6l5S9fvszMmTPp1asXgYGB5X5p+vj4cP78ecPja3/979+/nwEDBrBkyRL6\n9OlDv3792Lp1Kzt37iQsLIzAwEAWL15c6TF/+ukn1q9fz5IlS/Dz8+OJJ54wPHfq1ClGjhxJr169\nmD17NsXFxYbnduzYQXh4OL169WLixIn8/fffle4fIDY2lunTpxMYGMiwYcPYvHnzDV97s/0OHDiQ\npUuXMnLkSO655x5eeeUVMjIyeOyxx/Dz82P69Onk5OQYXn/kyBEmTJhAr169CA8PZ//+/YbnJk+e\nzIcffsjEiRPx8/PjkUceISsry/AclP8cExMTmTx5Mv7+/gQFBTF79uwbvodt27Zx//33ExAQwJQp\nU4iLiwNg6tSpREdHs2DBAvz8/Dh37lyFskFBQQwdOhQXF5cKz+3fvx+dTseUKVMwNzdn8uTJKKXY\nt28fAL/88gvTp0/H1dUVV1dXpk2bxpo1awCIjo6+adlrbdq0qcJlmmXLlvHkk08CUFxczFtvvUVI\nSAj33nsv//znP8udG1u3biU8PJyePXsyZMgQdu/ezfvvv8+hQ4d4/fXXy/2C//PPPxk7diy9evVi\n3LhxHD58uNxn9P777zNx4kR8fX1JSkpi9erVDB48GD8/PwYPHsyGDRsq/Qz27NlDly5dsLCwKLf9\nRonm/PnzTJ06lcDAQIKCgnjhhRfK/Z+eOHHCcJnw2Wef5fnnny/XErjVefvFF18Yzlu9Xo+FhQVd\nunRh9+7dlcZTq1QjExISoqKiospt+/jjj9WLL76olFIqKSlJdezYUb322muqqKhI7dmzR3Xr1k09\n9dRTKjMzU6WkpKigoCB14MABpZRSv//+uxoyZIiKi4tTOp1O/fe//1UPPPBApcdOSkpSPj4+Sq/X\nG7atXLlSDRkyRCUlJan8/Hz19NNPG2K53rvvvqtee+01pdPplFarVQcPHjQ85+PjoxITEw2PX3rp\nJfXBBx8opZSKjo5WnTt3Vp9++qnSarXqp59+Ur1791Zz5sxR+fn56syZM6pbt27q/PnzlR732n1d\nW4/jxo1TaWlp6sqVK2rYsGFqxYoVSiml/vrrLxUUFKSOHTum9Hq9WrNmjQoJCVHFxcUV9p2fn68G\nDBig1qxZo/R6vTp58qQKDAxUZ8+erXDsW+03JCREPfDAAyojI0OlpqaqoKAgFRERoU6dOqWKi4vV\nlClT1CeffKKUUiolJUUFBASoXbt2KaWUioqKUgEBASozM1MppdSkSZNUaGioOnfunCoqKlKTJk1S\n77777g0/x9mzZ6vPPvtMKaVUUVGROnToUKV1GRcXp3x9fVVUVJTSarXqiy++UKGhoaqkpMRw3JUr\nV1Za9lo//fSTmjx5crltX331lXrsscfKbXv88cfVV199pZRSqmfPnuro0aOG544fP678/PyqVPZa\nBQUFys/PT507d86wbcyYMWrTpk1KKaXeeOMN9cQTT6js7GyVl5enZs6cqd577z2llFJHjx5VPXv2\nNPwPpqamqri4uErfe1ZWlurVq5dat26d0ul0asOGDapXr14qKyvL8PqQkBB19uxZpdPpVE5OjvLz\n81MJCQlKKaXS0tIM59H13nrrLbVgwYJy2yr7brjq3LlzKioqSpWUlKjMzEw1adIktWjRIqWUUsXF\nxSokJER9++23SqvVqi1btqguXbrc1nkbHh6uUlJSVFFRkeGYr7/+uvr3v/9daTy1qdG1JKpCo9Hw\n1FNPYWFhQZ8+fbC2tmb48OE4Ojri5uaGv78/J0+eBODHH39kxowZtG3bFhMTE2bMmEFMTAwXL168\n4f7VNb9WNmzYwMMPP0yLFi2wtrZm9uzZbNq0qdLr3WZmZqSlpZGUlISpqSk9e/asdJ+VMTc3Z+bM\nmZiamnLfffdx+fJlpk6dirW1Nd7e3nh7e9/0135lpkyZgrOzM/b29oSEhHDq1CkAVq5cyYQJE+jW\nrRsajYbw8HAsLCw4evRohX3s2LEDT09PwsPD0Wg0dOrUiSFDhvDrr79WeG1V9jtp0iSaNWuGq6sr\n/v7+9OjRAx8fH8zNzQkNDTXEuG7dOoKDg+nXrx9Q+gu9a9eu7Ny507Cv0aNH06pVKywsLBg2bJih\n7FXX1rmZmRnJycmkpqZiYWGBn59fpXW2efNmgoODCQoKwtTUlEceeYTCwsJyv5DvVH5+PnZ2duW2\nNWnSxPCL9/rn7ezsyM/Pr1LZa1lZWTFo0CDDr/SEhATi4+MNl25WrVrFvHnzsLOzw8bGhhkzZhhe\nu2rVKsaOHUtQUBAArq6utG3bttL3ExkZSZs2bRgxYgQmJiYMHz6cdu3asWPHDsNrIiIi8PLywsTE\nBFNTU0xNTTl9+jRFRUU4Ozvj5eVV6b5zcnKwtbWtsP2pp54iICCAXr16ERAQwMqVKwFo1aoVQUFB\nmJmZ4ejoyNSpUzlw4ABQ2iLV6XRMm
jQJU1NTQkND6d69u2GfVTlvp0yZgpubW7mWja2tLdnZ2ZXG\nX5vMjB1AXeXk5GT428rKCmdnZ8NjS0tLwz/XhQsXWLhwIW+99RZQdr06NTUVDw+PWx7n0qVLNG/e\n3PC4RYsWaLVa0tPTcXV1LffaRx99lI8//pjp06ej0WgYN25cpdemK+Pg4IBGozG8n8re49X3VFXX\nlre2tiYtLQ0orZO1a9caRucopdBqtVy6dKnCPi5cuMCRI0cICAgwvFan0xEeHl7pa2+132tjsrS0\nrPD42s9t8+bNhi+cq/u6+uUFlPvMra2tb1o/c+fO5YMPPmDs2LGGfoPKRs5c/3lrNBo8PDxITU29\n4b6rysbGpsKXem5uLk2aNKn0+dzcXGxsbKpU9nrDhw/n7bff5sknn2TDhg0MHjwYCwsLMjMzKSgo\nKPfe9Xq9IaGmpKQwYMCAKr2f6+sKoHnz5uXqyt3d3fC3tbU177//PkuXLmX+/Pn07NmTuXPn0q5d\nuwr7tre3Jy8vr8L2Tz/9lN69e1fYnpmZyRtvvMHBgwfJz89Hp9Ph4OAAlPZvuLm5lXv9tf/7VTlv\nr30fV+Xl5WFvb19he22TJHGX3N3deeKJJ6rUAX71S/parq6u5fowkpOTMTMzK/cFdZWNjQ3/+Mc/\n+Mc//kFsbCyTJ0+me/fu9O7dG2trawoKCgyvTUtLq/TEqw3u7u7MnDmTxx9//Jav9fDwIDAwkKVL\nl1brfqty3PDwcBYsWHDbZSv7HJ2cnAx9VYcOHWLatGkEBATQsmXLcq9zdXXlzJkz5bZdvHixWj6r\n9u3bs2zZsnLbTp8+behD8fb2JiYmhm7dugGlfUrt27e/adnKRlgB3HvvvcybN4+YmBg2btzI/Pnz\nAXB0dMTa2poNGzZU+JEDpZ/htX1n17q+Xl1dXdmyZUu5bRcuXKB///43LNO3b1/69u1LcXEx77//\nPq+++irLly+vcKyOHTuydu3aCttv1CJ/99130Wg0bNiwAXt7e7Zu3WroN3FxcamQ5C9evEirVq0M\n7/lOztu4uDhGjhx5W2VqglxuqsStLt1ca+LEiSxevJizZ88Cpc3Yyi6VADRr1gwTExMSExMN24YP\nH86yZctISkoiLy+P999/n+HDh2NiUvGjiYyMNJS1sbExNK+htON6w4YN6PV6du3aZWgKVwdnZ+cb\n/mNXZvz48axYsYJjx44BpZcydu7cWekv8eDgYOLj41m7di1arZaSkhKOHz9u6My90/3eysiRI9m+\nfTu7d+9Gr9dTVFTE/v37q/SLvrLP8ddffzWUtbe3x8TEpNLPcNiwYURGRrJv3z60Wi1Lly7F0tIS\nX1/fKsWt1+spLi5Gq9WW+xsgICAAExMTvv32W4qLiw2/XAMDAwEIDw9n2bJlpKamkpqayrJlyxg9\nevRNy1b2qxrA1NSUsLAw3n77bbKzs+nbty+AoYW7aNEiMjMzAUhNTTV0wI4dO5bVq1ezb98+lFKk\npqYaPuvrz7MBAwZw7tw5Nm7ciE6nY9OmTcTFxRESElJpTBkZGWzfvp2CggLMzMwM/yOV6du3LydO\nnCjXoX4zeXl52Nra0qRJE1JTU8v9qPH19cXU1JTly5ej0+nYunWr4RyFOztvi4uLOXHihKFejanR\nJYnKfgXe6jU3ezx48GAee+wxnn/+efz9/Rk5ciR//PFHpfu1srJi5syZTJw4kYCAAI4dO8bYsWMZ\nNWoUkyZNIjQ0FGtr6xsOu0tISODhhx/mnnvuYeLEiTz00EP06tULgJdffpnt27fTq1cvNm7cyODB\ng+/qPV5r7NixnD17loCAAJ5++ulbvr5r1668/vrrLFiwgICAAMLCwgyjaK5na2vLl19+yaZNm+jX\nrx/9+vXj3XffrfSf91b7vZ335O7uzqeffsrixYsJCgoiJCSEL7/80vAD4WZlK/scjx8/zrhx4/Dz\n8+Opp57i5ZdfpkWLFhXKtm3blnfeeYfXX3+doKAgIiMj+eyzzzAzM7vlcQHWrl1L9+7dWbBgAYcO\nHaJHjx6GG0HNzc359NNPWbNmDQEBAaxevZpPP/3UsO8JEyYQEhLCyJEjGTlyJCEhIYwfP75KZSsz\nfPhw9u7dy7Bhw8olxBdeeIHWrVszfvx4/P39mT59OgkJCQB0796dRYsWsWjRInr27MmUKVMM/XdT\npkzh119/JTAwkIULF+Lg4MBnn33G0qVL6d27N0uXLmXx4sU0bdq00rrS6/V89dVX9O/fn969e3Pg\nwAFee+21SmN3cnKid+/ebN26tdz2J554wnBPlZ+fH7NmzQLg6aef5q+//sLf35+ZM2cSFhZmKGNu\nbs7HH3/MypUr6dWrFxs2bGDgwIGG/oXbPW+hdARcYGBgpaPYaptG3c7P5ts0f/58IiMjcXJyYv36\n9QDExMTw2muvUVRUhJmZGa+99pqh+SuEELUlNjaWl156ydA5XZ3Gjx/PxIkTiYiIuKPyDzzwAAsX\nLrzpPTO1pUaTxMGDB7G1tWXu3LmGJPHII48wbdo07r33Xnbu3MmSJUv49ttvayoEIYSocQcOHKBt\n27Y4Ojqybt06/vWvf7F169ZK+xbrmxrtuPb39yc5ObncNo1GY7ihKScnp8KoACGEqG/i4+N57rnn\nyM/Pp1WrVnz00UcNIkFADbckoHS0zsyZMw0tidjYWB599FHDnc8rVqyo0lBRIYQQta/WO65/+OEH\nXn75ZSIjI5k3b55h6JwQQoi6p9aTxC+//GIYeTN06NByQ8VupoYbPEIIUe9l5xVz/Gx6tX5f1vjN\ndNcH6+bmxv79+wkICGDv3r20adOmSvvRaDSkpeXc+oWNgIuLndTF/0hdlJG6KNPY6iL9SgFb9p9n\n17ELFJfo+ee0XrRyK51mxcXF7halb65Gk8ScOXOIjo4mKyuL4OBgZs2axeuvv84bb7yBXq/H0tKy\nVtZxEEKIhijjSiG/7I5j71+p6JWimb0lw4Jb09K18ulU7kSNd1xXp8b0y+BmGtuvpJuRuigjdVGm\noddFXmEJG/eeY+vBJLQ6Pc2dbRkW2IrAzm6YmZbvRajTLQkhhBDV61RCJv9de4LcghKa2VsS0a8d\nQV3cMTG59WwSd0KShBBC1BORR5JZvuU0AGODvRjc0xML88rnp6oukiSEEKKO0+n1/LQ9lt8PnqeJ\ntTlPj+5Gh5YOtXJsSRJCCFFHXczIY/fxi+z9K4Ws3GI8nGx4dlwPXB2say0GSRJCCFHHFBRp+WL9\nSY6cTQfA2tKMgX4tGN3fCxur2v3aliQhhBB1SGZ2IR+sPEZSWi7enk0Z5OfJPe2da7zv4UYkSQgh\nRB1x/lIuH6w8yuWcIkL8WvDQ4A41NmqpqiRJCCGEESmlOJt8hT3HU4g+mUpRiY7xId6EBbSs0iJp\nNU2ShBBCGMmhvy+xMjKWS5dL16d3tLNk+vBO9PKpuD64sUiSEEKIWqbXK9b8EcfGvecwNzMhqI
sb\nfbp50KmVo9EvL11PkoQQQtSi/MISFq87yfG4DFwdrZk1uhstXKpvrqXqJklCCCFqScaVQt798Qgp\nmfl0bdeMx0d2wdbK3Nhh3ZQkCSGEqAUX0vN498cjXM4pYkivlowP8a5zl5YqI0lCCCFqWNyFbD5Y\neZTcghLGhXgxLLC1sUOqMkkSQghRA5RSJKTksPvYRfb8dZESrZ5pw3zo16O5sUO7LZIkhBCimh2L\nzWBV5FmS0vIAcGhiweQRHbmng4uRI7t9NZok5s+fT2RkJE5OTqxfv96w/dtvv2X58uWYm5szYMAA\nXnjhhZoMQwghas3Wg+f5YdsZTDQaenZ0oV93D7q0bYapicmtC9dBNZokRo8ezeTJk5k7d65hW3R0\nNDt27GDDhg2YmZmRmZlZkyEIIUSt0OsVK7afYevBJOxtLXh2bHfaetgbO6y7VqNJwt/fn+Tk5HLb\nfvjhBx577DHMzEoP3axZs5oMQQghaoRerzgWm0Fyei5pWYWcS83hXEoOzZ1teW5sd5xrcTrvmlTr\nfRIJCQkcPHiQ999/H0tLS+bOnUu3bt1qOwwhhLhjWp2epRtPEX0ytdz27l5OzBjRGZs6fu/D7aj1\nJKHT6cjOzuann37i2LFjPPfcc2zbtq1KZe92Qe+GROqijNRFGamLMjVVF0UlOv799QEOnkqlU5tm\njB3UHrdmNrg52mBl2fDGAtX6O3J3d2fIkCEAdO/eHRMTEy5fvoyjo+Mty6al5dR0ePWCi4ud1MX/\nSF2UkbooU1N1UVCk5eOfjxGTmEWXts14OqIblhal6zzkZBdQF2v/bpNljXe3K6XKPR48eDB79+4F\nID4+Hq1WW6UEIYQQxhR/MZvXvz5ITGIWPTu68MyY7oYE0ZDVaEtizpw5REdHk5WVRXBwMLNmzWLM\nmDHMmzePESNGYG5uzltvvVWTIQghxF3R6fVs3HuO9XsS0OkVQ3q1ZFyIV70d0nq7NOr6n/p1mDSl\nS8llhTJSF2WkLspUV12kXylg8boTxCZn42hnySPDO9G5Tf0akXm3l5saXi+LEEJUgyNn01m64SR5\nhVoCOrkyOaxjnZ+xtSZIkhBCiGsUlehY+0c8v+5PxMzUhKlDO9K/R/M6sZSoMUiSEEIIILeghO2H\nkth6KIncghLcmtnwxKgutHJr3MOKJUkIIRo1vV6xYW8Cm/ado7hEj62VGff3acOwwFZYN8D7Hm6X\n1IAQotG6klfM5+tOcOrcZRyaWDCmf2v69fDAykK+Gq+SmhBCNEp/J17ms7UnuJJXjK+3M4/c36lR\ndkzfiiQJIUSjUqLV88vuOH6NTkSDhnEhXgwNaNVoO6ZvRZKEEKLRSEzNYcmGkySl5eHc1IrHRnSm\nvaeDscOq0yRJCCEavBKtno17E9i49xw6vWKAb3PGh3hLx3QVSA0JIRq00+ez+PrXGC5m5ONoZ8nU\noT5093Iydlj1hiQJIUSDlH6lgLV/xLPnrxQ0wCA/T0YPaCeth9sktSWEaFCu5Baxenc8m6NKJ+Tz\ndLFlylAfvFs0NXZo9ZIkCSFEvafT6zkRn8nu4ykcOZOGVqdwcbAi/N52BHZ2w8RERi7dKUkSQoh6\nq6hYx/Y/k9hy8DxXcosBaO5sS/gAL3zbNcPMtHFM512TJEkIIeqdohIdO/5MZnP0OXLyS7C2NCPE\nrwX3dvOgjbsdrq72Mm16NZEkIYSoV1Iz8/lw1TFSMvOxtjRlZN82DOnVEhu5W7pG1GhbbP78+fTp\n04cRI0ZUeG7p0qX4+PiQlZVVkyEIIRqQUwmZvPHNQVIy8xnU05O3ZvYhvF87SRA1qEaTxOjRo1m6\ndGmF7SkpKURFRdG8efOaPLwQooEo0erYevA87/10lMJiHdPu8+Gh0A40sZbkUNNq9HKTv78/ycnJ\nFbYvWrSIuXPn8sQTT9Tk4YUQ9ZhSiuNxGUSfvMSRs2kUFOloYm3O06O70aGlTKVRW2q9T2L79u14\neHjQsWPH2j60EKIeWRUZy+boRACc7K0Y4NuCQX6eODW1MnJkjUutJonCwkI+++wzvvzyS8M2pVSV\ny9/tgt4NidRFGamLMg2lLk7EZfDr/kQ8nG2Z/aAfHVs53vYsrQ2lLoytVpNEYmIiycnJjBo1CqUU\nqampjBkzhpUrV+LkdOu5VGRIWykXFzupi/+RuijTUOqiqFjHe98fAgXThvrgZGNOenrube2jodRF\ndbjbZFnjSeLalkKHDh3Ys2eP4fHAgQNZs2YNTZvK7fJCiFKrdsZy6XIBQwNa4e0p3w3GVqOjm+bM\nmcOECROIj48nODiYn3/+udzzGo3mti43CSEatphzl9l2KAkPJxvC+7U1djiCGm5JvPvuuzd9ftu2\nbTV5eCFEPXL4TBpfbjyFRgOPDO+MhbmpsUMSyB3XQggj0+r0rIqMZcuB85ibmTD9vk60a25v7LDE\n/0iSEEIYTWJqDl//+jfxF7Nxb2bDE+FdaenaxNhhiWtIkhBC1LrcghLW7Ioj8kgySkFQFzcmh3XE\nykK+kuoa+USEELXm6uytG/cmkFeoxcPJhomD29O1rSwnWldJkhBC1LgSrY7IIxfYtPccV/KKsbY0\nY8JAbwb29JQ1H+o4SRJCiBqhV4qzSVfYdzKVA6dSySvUYmlhyv19WhMW0Apbmbm1XpAkIYSodhcz\n8vho1TFSLxcA0NTWgvt6tyYsoCV2NhZGjk7cDkkSQohqdTmniPd+PEJGdhG9u7jRt6sHnVo7yjrT\n9ZQkCSFEtckvLOG9n0oTRET/dozo08bYIYm7JD1GQohqUVSi46NVx0hOy2OQnyf3B7U2dkiiGkhL\nQghxx/ILtRw9m87hs+n8FZdBYbGOXj6uTBzc/ran9hZ1kyQJIcQdOZt8hU9+PkZ2fgkALg5WDOrp\nyci+baX/oQGRJCGEuG37TqTw5aYY9HrF/X1aE9jJjebOttJ6aIAkSQghqkwpxdrd8azbk4C1pSlP\nhHeTu6UbOEkSQogqUUqx/PfTbP8zGeemVjw7rgctnG2NHZaoYZIkhBC3pFeK5VtOs+NwMp4uTXhh\ngi/2tnJTXGNQo0li/vz5REZG4uTkxPr16wF4++232bFjBxYWFrRq1Yo333yTJk1kamAh6iq9Uny3\n5TSRh5Np6VqaIOSu6cZDo2pw/dCDBw9ia2vL3LlzDUkiKiqK3r17Y2Jiwn/+8x80Gg1z5syp0v5k\nYfNSssh7GamLMtVVF4XFWtbtTiD2whWy84rJzi+moEhHK9cmvDDxHppY1/05l+S8KOPiYndX5Wu0\nJeHv709ycnK5bX369DH87evry2+//VaTIQghbkNs8hW+WH+SS1kFaDRgZ22Ok70VLVya8FBoh3qR\nIET1MmqfxKpVqxg+fLgxQxBCAHq9Yn1UAuv3JKCUYlhgK8L7tcPcTCZlaOyMliT++9//Ym5uzogR\nI6pc5m6bTQ2J1EUZqYsyd1IX+YUlvPPdIQ6eSsXZw
ZrZE/3o5u1cA9HVLjkvqodRksSaNWvYuXMn\n33zzzW2Vk2uMpeR6axmpizJ3UheZ2YV8uOoY5y/l0qVtM2aO6oKtlXm9r1M5L8rU6T4JKB1bfa1d\nu3axZMkSvvvuOywsZISEEMZyLiWHD1cdJSu3mGDf5jwY2kFWiRMV1GiSmDNnDtHR0WRlZREcHMys\nWbNYvHgxJSUlTJ8+HYAePXrwz3/+sybDEEJc5+/Ey3y46hhFxTrGh3gTFtBSptQQlarRIbDVTZqP\npaQpXUbqokxV6+LI2XT++8tf6PWKx0Z0JqCTWy1EV7vkvChT5y83CSGMK7eghKycIrLzi0lMzWVV\nZCxmphqeGdudbu1k3iVxc5IkhGjA/jh6gW9++xudvuyCgY2lGc+O6057TwcjRibqC0kSQjRQx2Iz\n+PrXv7GxMqNXJ1fsbSywszGnWzsnXBysjR2eqCckSQjRACWkZPPfX/7C1FTDs2O749WiqbFDEvWU\njHcTooFJSsvlg5XHKC7RMWNEF0kQ4q5IS0KIBuDvxMus3h3PgRMppF4uAOCh0A707Ohi5MhEfSdJ\nQoh6rKhEx4/bzxJ5uHQiTUsLU+5p70xAJzcCOze8oa2i9kmSEKKeSrqUy2frTnAhPQ9PF1seH90d\nN3tLuWtXb7olAAAgAElEQVRaVCtJEkLUMyVaPb/tT2TdngS0Oj2D/DwZF+JFi+YOcgOZqHaSJISo\nR04mZPLdltOkZOZjb2PO1GFduKe99DuImiNJQoh6oLBYy/LfT7PneAoaYKBfC0b3b4eNlSwCJGqW\nJAkh6rjE1Bz++8tfpF4uoLWbHVOHdaSNu72xwxKNhCQJIeoopRTb/0zmx+1n0OoUQwNaMXpAO+mY\nFrVKkoQQdZBSip93xrFp3zmaWJvz6P2d6O5V/1eLE/WPJAkh6hi9UqzYeoath5Jwc7TmhQn34NTU\nythhiUbqlkni/PnzrFq1iujoaFJSUrC0tMTHx4ewsDCGDBmCmdmNdzF//nwiIyNxcnJi/fr1AFy5\ncoXnn3+e5ORkPD09+eCDD7Czk7VoReNUotVzObeIgkItCoVSsPNIMruOXqSFiy0vPOBL0yaWxg5T\nNGI3XXTo//7v/zhx4gRDhw7lnnvuwdnZmaKiImJjY9m9ezcnT57kn//8J76+vpWWP3jwILa2tsyd\nO9eQJN555x0cHBx47LHH+Pzzz8nOzuaFF16oUrAyBryULKhSpj7WRVpWAd/8GsP5S7lk55dU+prW\nbnbMmeBLE+uqj16qj3VRU6QuytTookODBg1iwYIFFbZ37NiR++67j6ysLM6fP3/D8v7+/iQnJ5fb\ntm3bNr777jsAIiIimDx5cpWThBD1XWzyFT76+Rg5+SW4OlrTwqUJjnaW2FiZYaLRoNGAjZU5g/xa\nyPBWUSfcNEkMGDDgpoUdHBxwcLi9hUsyMzNxdi7tgHNxceHy5cu3VV6I+upAzCWWbDiJTqeYPKQD\nIX6exg5JiFuq0li6f//73+Tk5KDVannwwQfx9fVl7dq1NR2bEPVadl4xO48k882vMfxr2YHS9R1M\nNDw7rrskCFFvVGl0U1RUFC+99BKRkZG4ubnx/vvvM2PGDEaNGnXbB3RyciI9PR1nZ2fS0tJo1qxZ\nlcve7bW1hkTqokxdrIviEh3zv9hHSkY+AGamJnRq04wnx/agjUfN3QhXF+vCWKQuqsdtDYE9cOAA\noaGhuLm5odFoqlTm+n7xgQMHsnr1ambMmMGaNWsYNGhQlY8vHVGlpFOuTF2ti1+jE0nJyCeoiztD\nerWkhYut4Sa4moq3rtaFMUhdlLnbZFmly01OTk688sorbNq0ib59+6LVatHpdLcsN2fOHCZMmEB8\nfDzBwcH8/PPPzJgxg6ioKMLCwti7dy8zZsy4qzcgRF2TW1DChqgEbK3MeDC0Pa3d7eQuaVFvVakl\n8e6777Ju3TrGjh1L06ZNSUpKYtq0aVUqV5lly5bdVpBC1CcbohLIL9LywEBvbGWEkqjnqpQkmjVr\nxsMPP2x47OnpiaendLwJcb20rAK2/5mEc1MrBkrntGgAbtoGfvLJJzl27Filz+Xm5vL111/z448/\n1khgQtRHa3bFodUpRvdvh7mZXGIS9d9NWxLPPPMM7777LgkJCXTv3h0nJyeKioqIi4sjOTmZCRMm\nMHHixNqKVYg6Sa9XHI1NZ+vBJE6du0xrdzsCZH1p0UDcNEn4+PjwxRdfcPHiRfbv309qaiqWlpYM\nHTqUnj17YmFhUVtxClEnnU26wpINJ7mUVQBAp9aOPBTaAZMqjv4Toq6rUp+Eh4fHHd0TIURDlltQ\nwqe/HCc7r4T+PTwY3LMlnq5NjB2WENWqShdNMzIyeOGFF3jooYcAiImJ4YcffqjRwISo677b8jdZ\nucWE92vLw8M6SYIQDVKVksQrr7xCz549yc7OBqBdu3Z8//33NRqYEHXZvpMp7D91Ce8WTRnWu5Wx\nwxGixlQpSaSmpjJx4kRMTU0BsLCwwMRERm6Ixikzu5DvfjuNpbkpj97fCVP5XxANWJXO7usXFsrO\nzq4w3YYQjUFqZj6frD5OfpGWCYO8cXW0MXZIQtSoKnVcDxkyhP/7v/8jLy+P1atX8/333zNmzJia\njk2IOkOr0/Pb/kTW7k5Aq9PTp6s7/Xs0N3ZYQtS4KiWJRx99lHXr1pGdnc3OnTuZPHmyjHYSjYJS\nimOxGfy8M46ktFzsbS14KLQD/h1dqjzJpRD1WZVngR05ciQjR46syViEqDZanf6uJtXT6vT8FZfJ\nuj3xJKSUziZ6b3cPmY9JNDpVShIZGRl89913JCYmotVqDds//PDDGgtMiDuVmJrDv5Yd4KmIbvh1\ncKlSGaUUv/wRz7HYDC7nFBrWntYAvXxcGdG3DZ4uMsRVND5VShJPPvkknTt3JigoyDDCSYi66kzS\nFZSCY7EZVU4S6/YksD4qAXMzE5rZWdLc2Ra3ZjYM6ukpyUE0alVKEgUFBbz22ms1HYsQ1eLS5dIp\nMs6lVm3RmX0nU1i7Ox7npla8MsUfe1uZbkaIq6qUJHr06MHff/9Nx44dazoeIe5a2v/mUUpOy63Q\nN/H7gfOsj0qgh7cTfbt6YGZmwpcbY7C2NOXZsd0lQQhxnSoliQkTJjBp0iTc3d2xtLQ0bF+1atUd\nH3jZsmWsWrUKjUZDhw4dePPNN2XCQFEtrk62p9UpLqTn0cqtbPnGvSdSyC0oYc/xFPYcTwFAo4Fn\nRvWghVxWEqKCKiWJF198kZkzZ9K5c+dq6ZNITU3l22+/ZfPmzVhYWPDcc8+xadMmwsPD73rfonHT\nK2VoSUDpJaerSaKwWEtiai5eze0ZG+zF7uMXOR6bQXj/dnRt52SskIWo06qUJCwtLXnkkUeq9cB6\nvZ6CggJMTEwoLCzE1dW1WvcvGqcrucWUaPU42VuRkV1IYmqu4bnYC9nolaJDSwc6tnKkYytHI0Yq\nRP1QpYHk
/fr1Y9euXdV2UDc3N6ZNm0ZwcDD9+/fHzs6OPn36VNv+ReN16XI+APd0cMZEoynXeX3m\nfBYA7T0djBKbEPVRlVoSP/30E59//jm2trZYWFiglEKj0bB37947Omh2djbbtm1jx44d2NnZ8cwz\nz7B+/XpGjBhxR/sT4qqr/RGeLk3wcLLhfGoueqUw0Wg4k3QFAG/PpsYMUYh6pUpJ4ueff67Wg0ZF\nRdGyZUscHEp/0YWGhnL48OFbJgkXF7ubPt+YSF2UubYu8or1ALRv04xzl3JJPpRECRrcm9kSdzGb\nVu52tG3VzFih1jg5L8pIXVSPKiWJFi1aVOtBmzdvztGjRykqKsLCwoJ9+/bRrVu3W5ZLS6vauPeG\nzsXFTurif66vi4Tk0ktKFoCbgzUAh0+l4OZoQ1GxjnYe9g227uS8KCN1UeZuk+VNk8SLL77IO++8\nw5gxYyqdzOxOh8B2796dsLAwwsPDMTMzo3PnzowfP/6O9iXEtdKyCjAz1eBoZ0lrt9IhrYmpuVzJ\nLQagvVxqEuK23DRJXLp0CYB//OMf1X7gp59+mqeffrra9ysat7SsQpyaWmNioqGla+kvqHMpOVhb\nlp7qHaTTWojbctMkcXW50oCAgFoJRoi7kV+oJbeghLYe9gDYWJnh6mBNYmoOJiYamtlb4tTUyshR\nClG/yLqLosG4ehOd6//6IgBauduRV6glJ79EWhFC3IGbtiROnz5NUFBQhe13OwRWiJpwdfiri2NZ\nkmjt1oSDMaWXTaU/Qojbd9Mk0aZNGz7//PPaikWI25KTX4xdcdn6JldvpLu2JdH6mnmb2reUloQQ\nt+umScLCwqLah78KUR2KS3S8siQab08HZo0uHT6dVklL4uq8TbZWZjR3tq39QIWo526aJMzNZZlG\nUTfFJGaRk1/C4dNpnEvJobW7nWEdCZdrOqftbS3o29UdFwdrTGRNaiFu2007rn/66afaikOI23Is\nNt3w9+8HzwOlLQlHO0sszMvPVPzI/Z0ZeW/bWo1PiIZCRjeJekcpxbHYDKwtzWjh0oTok6mkXykg\nM7sIl2v6I4QQd0+ShKh3Lmbkk36lkC5tmzGqfzt0esWqyFgU4OIg90EIUZ0kSYh651hsBgA9vJwI\n6dkSWysz9p8qHebqKi0JIaqVJAlR71ztj+jazgkrSzMG+JaNwLt2ZJMQ4u5JkhD1Sn6hljNJV2jr\nYUdT29I10Qf6tcDUpHTkkquDjTHDE6LBkSQh6pWTCZno9Ipu16xJ3czeij5d3bG2NMPDSZKEENWp\nSutJCFFXGPojvJ3LbZ8ytCMPDGxvmO1VCFE95D9K1Bt6pTgel4G9jTmt3csvpGJqYoKNlTSMhahu\n8l8l6o0T8ZlcySumWzsnuXtaiFpitJZETk4OL7/8MmfOnMHExIRFixbRo0cPY4Uj6jCtTs+GqAQ2\nRJ1DAwR1dTd2SEI0GkZLEgsXLmTAgAF89NFHaLVaCgsLjRWKqMMupOexdONJ4i/m4GRvySPDO+PT\n2tHYYQnRaBglSeTm5nLw4EH+/e9/lwZhZkaTJk2MEYqoo/ILtazbE8+2Q0no9IqgLu48FNoBGyvp\nRhOiNhnlPy4pKQlHR0fmzZtHTEwMXbt25eWXX8bKSqZUaOwKirTsO5nK2j/iyM4vwbmpFRMHteee\nDi7GDk2IRkmjlFK1fdC//vqLBx54gBUrVtCtWzcWLlyInZ0dzzzzTG2HIuoApRSH/05j+8Hz7P3r\nIsUlOqwsTBk3qAPhA7wqzOoqhKg9RmlJuLu74+7uTrdupYvFhIWFsWTJkluWS0vLqenQ6gUXF7sG\nUxdZuUV8ufEUf8VnAuDmaE1QV3f6dW+Oo50lV7Lyb1q+IdXF3ZK6KCN1UcbFxe7WL7oJoyQJZ2dn\nPDw8iI+Pp23btuzbtw8vLy9jhCKM6PDpNL7aHENuQQld2zVjVN+2tGtuj0aGtwpRZxitF/CVV17h\nhRdeQKvV0rJlS958801jhSJqWVGxjhXbz7DzyAXMzUx4KLQDA/1aSHIQog4yWpLw8fHh559/Ntbh\nhZEkpGSzeN1JUjPz8XRpwuMjO9PCRUa2CVFXyXhCUSu0Oj2/7U/klz/i0ekVYQEtGd3fC3Mzuelf\niLpMkoSoUVeXGv1x+1lSMvNp2sSCR4d3pkvbZsYOTQhRBZIkRI1JTM1hVWQsf8VnotFAiF8LIvq1\no4m1ubFDE0JUkSQJUe0SU3NYtyeBP0+nAdCljSMPDGqPp/Q9CFHvSJIQ1SY9q4CVkbEciCldb9qr\nuT2j7m1Ll7bNZOSSEPWUJAlx1wqKtGzad47f9p9Hq9PT1sOO8H7t6CrJQYh6T5KEuGNKKfaeSGFl\nZCxXcotxtLNkbLAXgZ3dZL0HIRoISRLijpxLyWH576c5m3wFczMTRvZtw7DA1lhayDxLQjQkkiTE\nbSko0rJmVxzbDiWhgJ4dXHhgoDfODtbGDk0IUQMkSYgqOxabwbe/xZCRXYRbMxsmDelAlzZyv4MQ\nDZkkCXFLl3OK+GnHWaJPpmJqouH+Pm0Y0ac15mZyaUmIhk6ShLghrU7P1oNJrN0TT1GxjrYe9kwb\n5oOnq9zvIERjIUlCVKCU4sjZdH7eGceF9DyaWJszYag3/Xo0l1FLQjQykiREOafOXWb1zlhiL2Sj\n0cAA3+aMGeAlU2kI0UhJkhBA6Qpxy7ec5tD/ptLo2cGF8P7taOFsa+TIhBDGJEmikVNK8cexi/y4\n/SwFRVraezZlwqD2tPWwN3ZoQog6wKhJQq/XM2bMGNzc3Pjss8+MGUqjdC4lhxXbzvD3+SysLEyZ\nHNaRAb7S7yCEKGPUJPHNN9/g5eVFbm6uMcNodDKzC1mzK46ov1JQgK+3M5OGdKCZvZWxQxNC1DFG\nSxIpKSns3LmTmTNn8tVXXxkrjEalqETHb9GJbNp3jmKtHk+XJjwwyFtuiBNC3JDRksSiRYuYO3cu\nOTk5xgqh0VBKcSDmEj/tOEtmdhFNbS14KLQdfbt5YGIil5aEEDdmlCQRGRmJs7MznTp1Ijo6usrl\nXFzsajCq+qUqdaGUYv+JFFZsPc3Z81mYmZowdmB7xg1qj41VwxnSKudFGamLMlIX1UOjlFK1fdD3\n3nuPdevWYWpqSlFREXl5eYSGhvL222/ftFxamrQ6oPTkv1VdHDmTzpo/4jh/KRcN4O/jypgB7XB1\ntKmdIGtJVeqisZC6KCN1UeZuk6VRksS19u/fz5dfflml0U3yoZe62T/A5Zwilv9+mj9Pp6HRQGAn\nN4b3adNg73eQL4MyUhdlpC7K3G2SkPskGgi9Uuw8coFVkWcpKNLRoaUDU8I60ryBJgchRO0wepII\nCAggICDA2GHUa38nXuaHbWdITM3F2tKMqUM7yjxLQohqYfQkIe7cpawCVu44y6G/S6fSCOrixthg\nbxztLI0cmRCioZAkUQ/lF5awKjKWLQcS0eoUXi3smTioA+2ay1QaQ
ojqJUmiHlFKEfVXCqt3xXE5\npwhHO0vGhXgR2MkNjVxaEkLUAEkS9URyeh7f/vY3p89nYWFuyqh72zI0sBWW5rI6nBCi5kiSqOOK\ninWsj0rgt/2J6PSKe9o78/QD96DR6owdmhCiEZAkUUcppYg+mcrKyFgu5xThZG/FQ6Ed8G3vjIuj\njYwBF0LUCkkSddC5lByW/36as8lXMDM14f4+bRjeuzWWFnJpSQhRuyRJ1CG5BSWs3hXHzsPJKKBn\nRxceCPHG2cHa2KEJIRopSRJ1gFanZ/exi6zeFUduQQkeTjY8FNqBzjKFtxDCyCRJGJFWpyfqrxQ2\nRCWQfqUQSwtTxod4M9jfEzNTE2OHJ4QQkiSM5fCZNH7Yeob0K4WYmZowuKcn9wW1xqGJ3C0thKg7\nJEnUsit5xXz/+2kOxFzC1ETDoJ6e3Ne7tUylIYSokyRJ1BK9XvHHsQusiowlr1CLVwt7Hh7WqcFO\n4S2EaBgkSdSC0+ez+H7raRJTc7G0MOWh0A6E+LWQWVqFEHWeJIkalJZVwM87Y9l/6hIAQV3cGRvs\nJZeWhBD1hiSJGpBXWMLGqHNsPXQerU7R1sOeBwe3x6tFU2OHJoQQt8UoSSIlJYW5c+eSnp6Oqakp\n48aNY8qUKcYIpVoppfjj2EVW7jhLXqEWJ3tLxgzwIqCzm1xaEkLUS0ZJEqampsybN49OnTqRl5fH\n6NGj6du3L15eXsYIp1qkXs7n680xxCRmYWVhyrhgLwb7e2JuJlNpCCHqL6MkCRcXF1xcXACwtbXF\ny8uLS5cu1cskUVCk5fcD59m47xwlWj2+3s5MGtKBZvZWxg5NCCHumtH7JJKSkoiJiaF79+7GDuW2\nlGh17Dh8gY17E8jJL8He1oJH7++Af0cXWQBICNFgaJRSylgHz8vLY/LkyTz55JMMHjzYWGHclqIS\nHb9Hn+PnHWdJzyrA2tKMiGBvRvVvh42VubHDE0KIamW0JKHVann88cfp378/U6dOrVIZY66hUFyi\nY/ufyfy6P5HsvGIszEwI8WvBfb1bY2djUauxuLjYyXoS/yN1UUbqoozURRkXF7u7Km+0y03z58/H\n29u7ygnCmOIvZrNkw0kuZuRjZWHK8KDWhPq3xN62dpODEELUNqMkiUOHDrF+/Xo6dOhAeHg4Go2G\n559/nv79+xsjnBvS6vSs25PApr3n0CvF4J6ejOrXFlu5rCSEaCSMkiR69uzJqVOnjHHoKjubdIVv\nfoshKS0PJ3srpg/vRKfWjsYOSwghapXRRzfVNbkFJayKPMuuoxcB6N+jOQ8M9MbaUqpKCNH4yDff\nNf48ncayzTHkFpTg6WLL5LCOtPd0MHZYQghhNJIkKL0h7oetZ9h9/CLmZiaMC/Ei1L+lrA4nhGj0\nGn2SOJmQybLNMaRfKaSVWxMeG9FF1ngQQoj/abRJIuNKIT9uP8PBv9PQAMODWjPq3rbSehBCiGs0\nuiRRotXz6/5ENkYlUKzV49XCnkmhHWntfnc3nAghREPUqJLEsdh0vt96hkuXC7C3tWBymBdBXd1l\nGm8hhLiBRpEk0rIKWLHtDIfPpGOi0RDq35JR97bFxqpRvH0hhLhjDfpbsqhEx+Z959i0LxGtTk+H\nlg5MCu2Ap2sTY4cmhBD1QoNMEkopDv2dxo/bz5KRXYhDEwvGD/QmsJObTOMthBC3ocEliYSUbFZs\nPcPppCuYmmgYFtiK+/u0kTumhRDiDjSYb84recX8HBnLnuMXUcA97Z0ZH+KNWzMbY4cmhBD1Vr1P\nElqdnu2Hkli7J56CIh2eLrZMGNSezm2aGTs0IYSo9+p1kvg78TLfbjnNhfQ8bK3MmDSkAwN8m2Nq\nIjfECSFEdaiXSSK3oISfdpxl97GLaIBg3+ZE9G9X6yvECSFEQ2e0JLFr1y4WLVqEUooxY8YwY8aM\nW5ZRShF9MpXvt54ht6CElq5NmDrUh3bN7WshYiGEaHyMkiT0ej2vv/46y5Ytw9XVlbFjxzJo0CC8\nvLxuWCbjSgEfrTrG0dgMLMxNGB/iTWgvT7m0JIQQNcgoSeLYsWO0bt2aFi1aADB8+HC2bdt20yTx\n1NvbySvU0qm1I1OH+eDqYF1b4QohRKNllCSRmpqKh4eH4bGbmxvHjx+/aRm9gqlDO9K/R3O5IU4I\nIWqJUZKEUuq2yyx9JZTCvKIaiEYIIcSNGCVJuLu7c+HCBcPj1NRUXF1db1rGzsZCRi9dw8VFpja/\nSuqijNRFGamL6mGUXt9u3bqRmJhIcnIyxcXFbNy4kUGDBhkjFCGEEDdhlJaEqakpr776KtOnT0cp\nxdixY2/aaS2EEMI4NOpOOgiEEEI0CnKTgRBCiBuSJCGEEOKGJEkIIYS4oTqfJHbt2sXQoUMJCwvj\n888/N3Y4tSolJYUpU6Zw3333MWLECL755hsArly5wvTp0wkLC+ORRx4hJyfHyJHWHr1eT0REBDNn\nzgQgKSmJ8ePHExYWxuzZs9FqtUaOsHbk5OTwzDPPMGzYMIYPH87Ro0cb7XmxbNky7r//fkaMGMGc\nOXMoLi5uNOfF/Pnz6dOnDyNGjDBsu9l58MYbbzBkyBBGjRrFqVOnqnSMOp0krs7xtHTpUjZs2MDG\njRuJjY01dli1xtTUlHnz5rFp0yZWrFjB8uXLiY2N5fPPPycoKIjffvuNwMBAFi9ebOxQa80333xT\nbiTcf/7zH6ZNm8Zvv/2GnZ0dq1atMmJ0tWfhwoUMGDCAzZs3s3btWtq1a9coz4vU1FS+/fZbVq9e\nzfr169HpdGzcuLHRnBejR49m6dKl5bbd6DzYuXMniYmJbNmyhQULFvDaa69V6Rh1OklcO8eTubm5\nYY6nxsLFxYVOnToBYGtri5eXF6mpqWzbto2IiAgAIiIi2Lp1qzHDrDUpKSns3LmTcePGGbbt27eP\nsLAwoLQufv/9d2OFV2tyc3M5ePAgY8aMAcDMzAw7O7tGe17o9XoKCgrQarUUFhbi6upKdHR0ozgv\n/P39sbcvPwv29efB1e/Mbdu2ER4eDkCPHj3IyckhPT39lseo00misjmeLl26ZMSIjCcpKYmYmBh6\n9OhBRkYGzs7OQGkiuXz5spGjqx2LFi1i7ty5hrm7Ll++TNOmTTH530zA7u7ujeL8SEpKwtHRkXnz\n5hEREcGrr75KQUFBozwv3NzcmDZtGsHBwfTv3x87Ozs6d+6Mvb19ozsvrsrMzCx3HmRmZgJw6dIl\n3N3dDa9zc3MjNTX1lvur00lCbuEolZeXxzPPPMP8+fOxtbVtlBMcRkZG4uzsTKdOnQznhVKqwjnS\nGOpGq9Vy8uRJHnzwQdasWYO1tTWff/55o3jv18vOzmbbtm3s2LGDP/74g4KCAnbt2lXhdY2xbq5X\n2fdpVeqlTq9MdydzPDU0Wq2WZ555hlGjRjF48GAAnJycSE9Px9nZmbS0NJo1a/jref/5559s376d\nnTt3UlRURF5eHosWLSIn
Jwe9Xo+JiQkpKSmN4vxwd3fH3d2dbt26ATBkyBC++OKLRnleREVF0bJl\nSxwcHAAYPHgwhw8fJjs7u9GdF1fd6Dxwc3MjJSXF8Lqq1kudbknIHE+loxe8vb2ZOnWqYdvAgQNZ\nvXo1AGvWrGkUdTJ79mwiIyPZtm0b7733HoGBgfznP/8hMDCQX3/9FWg8deHs7IyHhwfx8fFAab+M\nt7d3ozwvmjdvztGjRykqKkIpxb59+2jfvn2jOi+ubyHc6DwYNGgQv/zyCwBHjhzB3t7ecFnqZur8\ntBy7du1i4cKFhjmeqrLMaUNx6NAhJk2aRIcOHdBoNGg0Gp5//nm6d+/Oc889x8WLF2nevDkffvhh\nhc6rhmz//v18+eWXfPbZZ5w/f57Zs2eTnZ1Np06deOeddzA3Nzd2iDUuJiaGl19+Ga1WS8uWLXnz\nzTfR6XSN8rz45JNP2LhxI2ZmZnTu3Jk33niDlJSURnFezJkzh+joaLKysnB2dmbWrFkMHjyYZ599\nttLzYMGCBfzxxx9YW1vz5ptv0qVLl1seo84nCSGEEMZTpy83CSGEMC5JEkIIIW5IkoQQQogbkiQh\nhBDihiRJCCGEuCFJEkIIIW5IkoSocwYOHMjZs2dr5ViffPJJuWmk582bx/Lly+96v/PmzWPEiBHM\nnj37rvd1MzExMWzevLlGjyEaN0kSolH75JNPKCkpqdZ9pqens2XLFtavX897771Xrfu+3smTJ+84\nSej1+mqORjREkiREvREfH89jjz3GuHHjCA8PN0w9AODj48PixYsZO3YsoaGhbNmyxfDcb7/9xrBh\nwxg9ejSLFy/Gx8eHgoICFixYgEajYcKECURERJCbmwvA6dOnmTp1KmFhYbz00ks3jOeXX35hxIgR\njBo1ilmzZpGZmUleXh5Tp06lqKiIiIgIvv7663Jl1q5dy9NPP214rNPp6Nevn2GOsiVLljB+/HhG\njx7NE088QUZGBgAlJSW89dZbjBgxgvDwcGbNmkVWVhYff/wx+/btIyIigoULFwKlsxREREQwatQo\npk2bxvnz54HSO9XDw8N54403mDBhAn/88cfdfByisVBC1DEhISHqzJkz5bZptVoVERGh4uLilFJK\n5ebmqrCwMMPjjh07quXLlyullDp06JDq16+fUkqp9PR0FRAQoBITE5VSSn311VfKx8dH5efnG8oV\nFGMkQHAAAAOxSURBVBQYjvPSSy+pBx98UBUXF6vi4mI1fPhwFRUVVSHG06dPq3vvvVelp6crpZT6\n4IMP1HPPPaeUUiopKUn17t270vdWUFCgevfurS5fvqyUUmr79u1q6tSpSiml1q5dq1599VXDa7//\n/ns1Z84cpZRSH3/8sZo1a5bSarVKKWUov3r1avXMM88YymRkZKjevXur2NhYpZRSK1euVOPGjVNK\nKRUdHa06d+6sjh49WmlsQlRGWhKiXkhISCAuLo7Zs2cTHh7OQw89RElJSbmVCu+77z4AfH19SUtL\no7i4mKNHj9K1a1datmwJwNixYyvsW103M83gwYMxNzfH3Nyczp07k5iYWKFMdHQ0wcHBODk5ATBh\nwgSioqJu+T6srKwYNGgQGzZsAEonYLu6eND27dvZu3cv4eHhhIeH8/3333Px4kWgdKr0KVOmYGpq\nCmCY9fR6R48epVOnTrRr1w6AMWPGcOrUKfLz8wFo3bo13bt3v2WcQlxVp6cKF+IqpRTNmjVjzZo1\nlT6v0WiwtLQEMCw2o9PpKiSA6x9XxsLCwvC3qalppesjK6UqzMV/9bi3Eh4ezptvvsn999/P/v37\neeeddwz7fOKJJxg9enSlx6uKyuK69rGNjU2V9iPEVdKSEPVC27ZtsbKyYu3atYZtcXFx5OXlARW/\nRK8+9vX15cSJE4br8tf2YwA0adKk3ELxVRUUFMTOnTsNfQY//vgjffr0qXD8yvj7+5Obm8t7771H\naGioIbkNHDiQ77//nuzsbACKi4uJiYkBICQkhG+++cbQyX511bkmTZoY+lKuvt9Tp04ZphFfvXo1\nnTt3luQg7pi0JESdo9FoePjhhzEzMzP8Ml6/fj2fffYZCxcu5Msvv0Sn0+Hs7MwHH3xgKHP9PqB0\nAZZ//etfzJgxA0dHR4KDgzEzM8Pa2hqAadOmMWXKFKytrfn222+rHKO3tzezZ8/m4YcfxsTEhJYt\nW7JgwYIKx7+R8PBwPvroo/9v5w5xGASiIAwPBoMhHADNBRCcgtUEzQWQSByChAOgSHB4joVBLqlo\ngnumadK0/T/51LrZyeat1nW9Z2VZ6jgO1XWtIAh0XZeqqlKWZWqaRuM4yjmnMAyVpqmmaVJRFJrn\nWc455Xmurus0DIPatpX3XkmS3E0FeAVfhePnneepKIokPW/W27a9ZRcC+Ac0Cfy8ZVm077u894rj\nWH3ff/pIwNegSQAATDxcAwBMhAQAwERIAABMhAQAwERIAABMhAQAwPQAVSnSA55bZkwAAAAASUVO\nRK5CYII=\n",
+ "text/plain": [
+ "\u003cmatplotlib.figure.Figure at 0x7f47b8e3bd90\u003e"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.plot(eager_means)\n",
+ "plt.ylabel('Time(s)')\n",
+ "plt.xlabel('Length of vector')\n",
+ "_ = plt.title('Time to sum the elements of 1000 vectors (Eager)')\n",
+ "_ = plt.ylim(ymin=0)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "default_view": {},
+ "name": "Autograph vs. Eager vs Graph sum",
+ "provenance": [
+ {
+ "file_id": "1olZkm32B7n7pQwlIAXR0_w8fZhRHCtkX",
+ "timestamp": 1531755808890
+ }
+ ],
+ "version": "0.3.2",
+ "views": {}
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
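For readers skimming the diff: below is a condensed, self-contained sketch of the benchmark pattern the notebook above adds. It is a sketch only, written against the TF 1.x contrib AutoGraph API the notebook itself uses; get_elements here is a hypothetical stand-in for a helper the notebook defines in an earlier cell.

    # Minimal sketch of the notebook's AutoGraph benchmark (see caveats above).
    import tensorflow as tf
    from tensorflow.contrib import autograph as ag

    def get_elements(num):
      # Hypothetical stand-in for the notebook's helper: a (num, 1) tensor of ones.
      return tf.ones((num, 1))

    def sum_all(elements):
      # Plain Python loop; AutoGraph rewrites it into graph control flow.
      sum_ = 0.0
      for i in tf.range(len(elements)):
        sum_ += elements[i][0]
      return sum_

    def run_trial(num):
      return sum_all(get_elements(num))

    ag_run_trial = ag.to_graph(run_trial)  # converts run_trial and its call tree
    with tf.Graph().as_default():
      result = ag_run_trial(100)
      with tf.Session() as sess:
        print(sess.run(result))  # 100.0 for a vector of 100 ones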
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index f7fe3de5da..ee71f4f9ac 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -23,7 +23,6 @@ from functools import wraps
from enum import Enum
# pylint:disable=g-bad-import-order
-import gast
import six
# pylint:enable=g-bad-import-order
@@ -245,19 +244,21 @@ def to_graph(e,
_, name, namespace = conversion.entity_to_graph(e, program_ctx, arg_values,
arg_types)
- module = gast.Module([])
+ nodes = []
for dep in reversed(program_ctx.dependency_cache.values()):
- module.body.append(dep)
- compiled_node, compiled_src = compiler.ast_to_object(
- module, source_prefix=program_ctx.required_imports)
+ nodes.extend(dep)
+ compiled_module, compiled_src = compiler.ast_to_object(
+ nodes,
+ source_prefix=program_ctx.required_imports,
+ include_source_map=True)
# The compiled code should see everything the entry entity saw.
# TODO(mdan): This might not work well if the call tree spans modules?
for key, val in namespace.items():
# Avoid overwriting entities that have been transformed.
- if key not in compiled_node.__dict__:
- compiled_node.__dict__[key] = val
- compiled_fn = getattr(compiled_node, name)
+ if key not in compiled_module.__dict__:
+ compiled_module.__dict__[key] = val
+ compiled_fn = getattr(compiled_module, name)
# Need this so the source_mapping attribute is available for the context
# manager to access for runtime errors.
@@ -270,7 +271,7 @@ def to_graph(e,
'"%s", which is reserved for AutoGraph.' %
(compiled_fn, source_map_attribute_name))
setattr(compiled_fn, source_map_attribute_name,
- compiled_node.__dict__['ag_source_map__'])
+ compiled_module.__dict__['ag_source_map__'])
if verbose:
logging.info('Compiled output of %s:\n\n%s\n', e, compiled_src)
@@ -308,7 +309,7 @@ def to_code(e,
conversion.entity_to_graph(e, program_ctx, arg_values, arg_types)
code = '\n'.join(
- compiler.ast_to_source(dep, indentation)[0]
+ compiler.ast_to_source(dep, indentation)
for dep in reversed(tuple(six.itervalues(program_ctx.dependency_cache))))
return program_ctx.required_imports + '\n\n' + code
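
For orientation, a hedged sketch of how the reworked `to_graph`/`to_code` entry points are typically driven; the sample function is illustrative and the public `autograph` alias is an assumption, not part of this patch:

    from tensorflow.contrib import autograph

    def sum_squares(n):
      # A plain Python loop that AutoGraph can convert to graph ops.
      total = 0
      for i in range(n):
        total += i * i
      return total

    # to_graph returns the converted function; after this change the compiled
    # module (rather than a gast.Module node) carries the ag_source_map__
    # attribute used for runtime error reporting.
    graph_fn = autograph.to_graph(sum_squares)
    print(autograph.to_code(sum_squares))  # inspect the generated source
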
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 7bd0ba3f2d..57ec739a80 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -164,7 +164,7 @@ def class_to_graph(c, program_ctx):
class_namespace = namespace
else:
class_namespace.update(namespace)
- converted_members[m] = node
+ converted_members[m] = node[0]
namer = program_ctx.new_namer(class_namespace)
class_name = namer.compiled_class_name(c.__name__, c)
@@ -175,10 +175,10 @@ def class_to_graph(c, program_ctx):
# program_ctx.update_name_map(namer)).
output_nodes = []
renames = {}
- bases = []
+ base_names = []
for base in c.__bases__:
if isinstance(object, base):
- bases.append('object')
+ base_names.append('object')
continue
if is_whitelisted_for_graph(base):
alias = namer.new_symbol(base.__name__, ())
@@ -190,28 +190,28 @@ def class_to_graph(c, program_ctx):
else:
# This will trigger a conversion into a class with this name.
alias = namer.compiled_class_name(base.__name__, base)
- bases.append(alias)
+ base_names.append(alias)
renames[qual_names.QN(base.__name__)] = qual_names.QN(alias)
program_ctx.update_name_map(namer)
# Generate the definition of the converted class.
- output_nodes.append(
- gast.ClassDef(
- class_name,
- bases=bases,
- keywords=[],
- body=list(converted_members.values()),
- decorator_list=[]))
- node = gast.Module(output_nodes)
-
+ bases = [gast.Name(n, gast.Load(), None) for n in base_names]
+ class_def = gast.ClassDef(
+ class_name,
+ bases=bases,
+ keywords=[],
+ body=list(converted_members.values()),
+ decorator_list=[])
# Make a final pass to replace references to the class or its base classes.
# Most commonly, this occurs when making super().__init__() calls.
# TODO(mdan): Making direct references to superclass' superclass will fail.
- node = qual_names.resolve(node)
+ class_def = qual_names.resolve(class_def)
renames[qual_names.QN(c.__name__)] = qual_names.QN(class_name)
- node = ast_util.rename_symbols(node, renames)
+ class_def = ast_util.rename_symbols(class_def, renames)
+
+ output_nodes.append(class_def)
- return node, class_name, class_namespace
+ return output_nodes, class_name, class_namespace
def _add_reserved_symbol(namespace, name, entity):
@@ -279,7 +279,7 @@ def function_to_graph(f,
program_ctx.update_name_map(namer)
# TODO(mdan): Use this at compilation.
- return node, new_name, namespace
+ return (node,), new_name, namespace
def node_to_graph(node, context, rewrite_errors=True):
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index 207225a1ac..bfc51365a3 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -60,10 +60,11 @@ class ConversionTest(test.TestCase):
return a + b
program_ctx = self._simple_program_ctx()
- ast, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
- self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
+ nodes, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
+ fn_node, = nodes
+ self.assertIsInstance(fn_node, gast.FunctionDef)
self.assertEqual('tf__f', name)
- self.assertTrue(ns['b'] is b)
+ self.assertIs(ns['b'], b)
def test_entity_to_graph_call_tree(self):
@@ -78,14 +79,11 @@ class ConversionTest(test.TestCase):
self.assertTrue(f in program_ctx.dependency_cache)
self.assertTrue(g in program_ctx.dependency_cache)
- self.assertEqual('tf__f', program_ctx.dependency_cache[f].name)
- # need one extra .body[0] in order to step past the try/except wrapper that
- # is added automatically, the other for the with tf.name_scope('f') that is
- # added automatically
- self.assertEqual(
- 'tf__g',
- program_ctx.dependency_cache[f].body[0].body[0].body[0].value.func.id)
- self.assertEqual('tf__g', program_ctx.dependency_cache[g].name)
+ f_node = program_ctx.dependency_cache[f][0]
+ g_node = program_ctx.dependency_cache[g][0]
+ self.assertEqual('tf__f', f_node.name)
+ self.assertEqual('tf__g', f_node.body[0].body[0].body[0].value.func.id)
+ self.assertEqual('tf__g', g_node.name)
def test_entity_to_graph_class_hierarchy(self):
@@ -118,9 +116,9 @@ class ConversionTest(test.TestCase):
self.assertTrue(TestBase in program_ctx.dependency_cache)
self.assertTrue(TestSubclass in program_ctx.dependency_cache)
self.assertEqual('TfTestBase',
- program_ctx.dependency_cache[TestBase].body[-1].name)
+ program_ctx.dependency_cache[TestBase][-1].name)
self.assertEqual('TfTestSubclass',
- program_ctx.dependency_cache[TestSubclass].body[-1].name)
+ program_ctx.dependency_cache[TestSubclass][-1].name)
def test_entity_to_graph_class_hierarchy_whitelisted(self):
@@ -139,10 +137,9 @@ class ConversionTest(test.TestCase):
self.assertTrue(TestSubclass in program_ctx.dependency_cache)
self.assertFalse(training.Model in program_ctx.dependency_cache)
self.assertEqual(
- 'Model',
- program_ctx.dependency_cache[TestSubclass].body[0].names[0].name)
+ 'Model', program_ctx.dependency_cache[TestSubclass][0].names[0].name)
self.assertEqual('TfTestSubclass',
- program_ctx.dependency_cache[TestSubclass].body[-1].name)
+ program_ctx.dependency_cache[TestSubclass][-1].name)
def test_entity_to_graph_lambda(self):
f = lambda a: a
diff --git a/tensorflow/contrib/autograph/lang/special_functions.py b/tensorflow/contrib/autograph/lang/special_functions.py
index 11135295a7..6149cbbd6c 100644
--- a/tensorflow/contrib/autograph/lang/special_functions.py
+++ b/tensorflow/contrib/autograph/lang/special_functions.py
@@ -26,6 +26,43 @@ from __future__ import print_function
from tensorflow.contrib.autograph.operators import data_structures
+def tensor_list(elements,
+ element_dtype=None,
+ element_shape=None,
+ use_tensor_array=False):
+ """Creates an tensor list and populates it with the given elements.
+
+ This function provides more uniform access to tensor lists and tensor
+ arrays, and allows optional initialization.
+
+ Note: this function is a simplified wrapper. If you need greater control,
+ it is recommended to use the underlying implementation directly.
+
+ Args:
+ elements: Iterable[tf.Tensor, ...], the elements to initially fill the list
+ with
+ element_dtype: Optional[tf.DType], data type for the elements in the list;
+ required if the list is empty
+ element_shape: Optional[tf.TensorShape], shape for the elements in the list;
+ required if the list is empty
+ use_tensor_array: bool, whether to use the more compatible but restrictive
+ tf.TensorArray implementation
+ Returns:
+ Union[tf.Tensor, tf.TensorArray], the new list.
+ Raises:
+ ValueError: for invalid arguments
+ """
+ if not (elements or (element_dtype and element_shape)):
+ raise ValueError(
+ 'element_dtype and element_shape are required for empty lists')
+ if use_tensor_array:
+ return data_structures.tf_tensor_array_new(elements, element_dtype,
+ element_shape)
+ else:
+ return data_structures.tf_tensor_list_new(elements, element_dtype,
+ element_shape)
+
+
def stack(list_or_tensor, element_dtype=None, strict=True):
"""Stacks the input, if it admits the notion of stacking.
diff --git a/tensorflow/contrib/autograph/lang/special_functions_test.py b/tensorflow/contrib/autograph/lang/special_functions_test.py
index a49cb64075..db492cc5c6 100644
--- a/tensorflow/contrib/autograph/lang/special_functions_test.py
+++ b/tensorflow/contrib/autograph/lang/special_functions_test.py
@@ -28,7 +28,23 @@ from tensorflow.python.platform import test
class SpecialFunctionsTest(test.TestCase):
- def test_basic(self):
+ def test_tensor_list_from_elements(self):
+ elements = [constant_op.constant([1, 2]), constant_op.constant([3, 4])]
+
+ l = special_functions.tensor_list(elements)
+ sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
+ with self.test_session() as sess:
+ self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+
+ def test_tensor_list_array_from_elements(self):
+ elements = [constant_op.constant([1, 2]), constant_op.constant([3, 4])]
+
+ l = special_functions.tensor_list(elements, use_tensor_array=True)
+ sl = l.stack()
+ with self.test_session() as sess:
+ self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+
+ def test_stack(self):
self.assertEqual(special_functions.stack(1, strict=False), 1)
self.assertListEqual(
special_functions.stack([1, 2, 3], strict=False), [1, 2, 3])
diff --git a/tensorflow/contrib/autograph/operators/data_structures.py b/tensorflow/contrib/autograph/operators/data_structures.py
index 06d8727b0f..cc0a3c3544 100644
--- a/tensorflow/contrib/autograph/operators/data_structures.py
+++ b/tensorflow/contrib/autograph/operators/data_structures.py
@@ -28,7 +28,6 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import list_ops
from tensorflow.python.ops import tensor_array_ops
-from tensorflow.python.ops import variables
# TODO(mdan): Once control flow supports objects, repackage as a class.
@@ -48,29 +47,101 @@ def new_list(iterable=None):
else:
elements = ()
- # TODO(mdan): Extend these criteria.
- if any(isinstance(el, variables.Variable) for el in elements):
+ if elements:
+ # When the list contains elements, it is assumed to be a "Python" lvalue
+ # list.
return _py_list_new(elements)
- return _tf_tensor_list_new(elements)
+ return tf_tensor_list_new(elements)
-def _tf_tensor_list_new(elements):
+def tf_tensor_array_new(elements, element_dtype=None, element_shape=None):
"""Overload of new_list that stages a Tensor list creation."""
elements = tuple(ops.convert_to_tensor(el) for el in elements)
+
+ all_dtypes = set(el.dtype for el in elements)
+ if len(all_dtypes) == 1:
+ inferred_dtype, = tuple(all_dtypes)
+ if element_dtype is not None and element_dtype != inferred_dtype:
+ raise ValueError(
+ 'incompatible dtype; specified: {}, inferred from {}: {}'.format(
+ element_dtype, elements, inferred_dtype))
+ elif len(all_dtypes) > 1:
+ raise ValueError(
+ 'TensorArray requires all elements to have the same dtype:'
+ ' {}'.format(elements))
+ else:
+ if element_dtype is None:
+ raise ValueError('dtype is required to create an empty TensorArray')
+
+ all_shapes = set(tuple(el.shape.as_list()) for el in elements)
+ if len(all_shapes) == 1:
+ inferred_shape, = tuple(all_shapes)
+ if element_shape is not None and element_shape != inferred_shape:
+ raise ValueError(
+ 'incompatible shape; specified: {}, inferred from {}: {}'.format(
+ element_shape, elements, inferred_shape))
+ elif len(all_shapes) > 1:
+ raise ValueError(
+ 'TensorArray requires all elements to have the same shape:'
+ ' {}'.format(elements))
+ # TODO(mdan): We may want to allow different shapes with infer_shape=False.
+ else:
+ inferred_shape = None
+
+ if element_dtype is None:
+ element_dtype = inferred_dtype
+ if element_shape is None:
+ element_shape = inferred_shape
+
+ l = tensor_array_ops.TensorArray(
+ dtype=element_dtype,
+ size=len(elements),
+ dynamic_size=True,
+ infer_shape=(element_shape is None),
+ element_shape=element_shape)
+ for i, el in enumerate(elements):
+ l = l.write(i, el)
+ return l
+
+
+def tf_tensor_list_new(elements, element_dtype=None, element_shape=None):
+ """Overload of new_list that stages a Tensor list creation."""
+ elements = tuple(ops.convert_to_tensor(el) for el in elements)
+
all_dtypes = set(el.dtype for el in elements)
if len(all_dtypes) == 1:
- element_dtype = tuple(all_dtypes)[0]
+ inferred_dtype = tuple(all_dtypes)[0]
+ if element_dtype is not None and element_dtype != inferred_dtype:
+ raise ValueError(
+ 'incompatible dtype; specified: {}, inferred from {}: {}'.format(
+ element_dtype, elements, inferred_dtype))
else:
# Heterogeneous lists are ok.
- element_dtype = dtypes.variant
+ if element_dtype is not None:
+ raise ValueError(
+ 'specified dtype {} is inconsistent with that of elements {}'.format(
+ element_dtype, elements))
+ inferred_dtype = dtypes.variant
- # TODO(mdan): This may fail for elements of variable shapes.
all_shapes = set(tuple(el.shape.as_list()) for el in elements)
if len(all_shapes) == 1:
- element_shape = array_ops.shape(elements[0])
+ inferred_shape = array_ops.shape(elements[0])
+ if element_shape is not None and element_shape != inferred_shape:
+ raise ValueError(
+ 'incompatible shape; specified: {}, inferred from {}: {}'.format(
+ element_shape, elements, inferred_shape))
else:
# Heterogeneous lists are ok.
- element_shape = constant_op.constant(-1) # unknown shape, by convention
+ if element_shape is not None:
+ raise ValueError(
+ 'specified shape {} is inconsistent with that of elements {}'.format(
+ element_shape, elements))
+ inferred_shape = constant_op.constant(-1) # unknown shape, by convention
+
+ if element_dtype is None:
+ element_dtype = inferred_dtype
+ if element_shape is None:
+ element_shape = inferred_shape
l = list_ops.empty_tensor_list(
element_shape=element_shape, element_dtype=element_dtype)
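
The inference rules above amount to the following behavior (a sketch; dtype conflicts raise rather than cast):

    from tensorflow.contrib.autograph.operators import data_structures
    from tensorflow.python.framework import dtypes

    # Homogeneous elements: dtype and shape are inferred.
    l = data_structures.tf_tensor_list_new([3, 4, 5])

    # An explicit dtype that contradicts the inferred one is rejected.
    try:
      data_structures.tf_tensor_list_new([3, 4], element_dtype=dtypes.float32)
    except ValueError as e:
      print(e)

    # Heterogeneous elements are fine for tensor lists (variant dtype),
    # but tf_tensor_array_new rejects them.
    l = data_structures.tf_tensor_list_new([3, [4, 5]])
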
diff --git a/tensorflow/contrib/autograph/operators/data_structures_test.py b/tensorflow/contrib/autograph/operators/data_structures_test.py
index 8bbb52d6c1..7ea11a839b 100644
--- a/tensorflow/contrib/autograph/operators/data_structures_test.py
+++ b/tensorflow/contrib/autograph/operators/data_structures_test.py
@@ -37,10 +37,51 @@ class ListTest(test.TestCase):
def test_new_list_tensor(self):
l = data_structures.new_list([3, 4, 5])
+ self.assertAllEqual(l, [3, 4, 5])
+
+ def test_tf_tensor_list_new(self):
+ l = data_structures.tf_tensor_list_new([3, 4, 5])
t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
with self.test_session() as sess:
self.assertAllEqual(sess.run(t), [3, 4, 5])
+ def test_tf_tensor_list_new_illegal_input(self):
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_list_new([3, 4.0])
+ # TODO(mdan): It might make more sense to type cast in this case.
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_list_new([3, 4], element_dtype=dtypes.float32)
+ # Tensor lists do support heterogeneous lists.
+ self.assertIsNot(data_structures.tf_tensor_list_new([3, [4, 5]]), None)
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_list_new([3, 4], element_shape=(2,))
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_list_new([], element_shape=(2,))
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_list_new([], element_dtype=dtypes.float32)
+
+ def test_tf_tensor_array_new(self):
+ l = data_structures.tf_tensor_array_new([3, 4, 5])
+ t = l.stack()
+ with self.test_session() as sess:
+ self.assertAllEqual(sess.run(t), [3, 4, 5])
+
+ def test_tf_tensor_array_new_illegal_input(self):
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_array_new([3, 4.0])
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_array_new([3, 4], element_dtype=dtypes.float32)
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_array_new([3, [4, 5]])
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_array_new([3, 4], element_shape=(2,))
+ with self.assertRaises(ValueError):
+ data_structures.tf_tensor_array_new([], element_shape=(2,))
+ # TensorArrays can infer the shape.
+ self.assertIsNot(
+ data_structures.tf_tensor_array_new([], element_dtype=dtypes.float32),
+ None)
+
def test_append_tensor_list(self):
l = data_structures.new_list()
x = constant_op.constant([1, 2, 3])
diff --git a/tensorflow/contrib/autograph/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD
index f77a6ab392..ddadc6b96e 100644
--- a/tensorflow/contrib/autograph/pyct/BUILD
+++ b/tensorflow/contrib/autograph/pyct/BUILD
@@ -100,6 +100,16 @@ py_test(
)
py_test(
+ name = "origin_info_test",
+ srcs = ["origin_info_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":pyct",
+ "//tensorflow/python:client_testlib",
+ ],
+)
+
+py_test(
name = "parser_test",
srcs = ["parser_test.py"],
srcs_version = "PY2AND3",
diff --git a/tensorflow/contrib/autograph/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py
index 86e3f56a64..d7453b0781 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util.py
@@ -20,7 +20,6 @@ from __future__ import print_function
import ast
-import collections
import gast
from tensorflow.contrib.autograph.pyct import anno
@@ -185,6 +184,7 @@ class PatternMatcher(gast.NodeVisitor):
if v != p:
return self.no_match()
+
def matches(node, pattern):
"""Basic pattern matcher for AST.
@@ -253,30 +253,61 @@ def apply_to_single_assignments(targets, values, apply_fn):
apply_fn(target, values)
-def iter_fields(node):
- for field in sorted(node._fields):
- try:
- yield getattr(node, field)
- except AttributeError:
- pass
-
-
-def iter_child_nodes(node):
- for field in iter_fields(node):
- if isinstance(field, gast.AST):
- yield field
- elif isinstance(field, list):
- for item in field:
- if isinstance(item, gast.AST):
- yield item
-
-
-def parallel_walk(node_a, node_b):
- todo_a = collections.deque([node_a])
- todo_b = collections.deque([node_b])
- while todo_a and todo_b:
- node_a = todo_a.popleft()
- node_b = todo_b.popleft()
- todo_a.extend(iter_child_nodes(node_a))
- todo_b.extend(iter_child_nodes(node_b))
- yield node_a, node_b
+def parallel_walk(node, other):
+ """Walks two ASTs in parallel.
+
+ The two trees must have identical structure.
+
+ Args:
+ node: Union[ast.AST, Iterable[ast.AST]]
+ other: Union[ast.AST, Iterable[ast.AST]]
+ Yields:
+ Tuple[ast.AST, ast.AST]
+ Raises:
+ ValueError: if the two trees don't have identical structure.
+ """
+ if isinstance(node, (list, tuple)):
+ node_stack = list(node)
+ else:
+ node_stack = [node]
+
+ if isinstance(other, (list, tuple)):
+ other_stack = list(other)
+ else:
+ other_stack = [other]
+
+ while node_stack and other_stack:
+ assert len(node_stack) == len(other_stack)
+ n = node_stack.pop()
+ o = other_stack.pop()
+
+ if (not isinstance(n, (ast.AST, gast.AST)) or
+ not isinstance(o, (ast.AST, gast.AST)) or
+ n.__class__.__name__ != o.__class__.__name__):
+ raise ValueError('inconsistent nodes: {} and {}'.format(n, o))
+
+ yield n, o
+
+ for f in n._fields:
+ n_child = getattr(n, f, None)
+ o_child = getattr(o, f, None)
+ if f.startswith('__') or n_child is None or o_child is None:
+ continue
+
+ if isinstance(n_child, (list, tuple)):
+ if (not isinstance(o_child, (list, tuple)) or
+ len(n_child) != len(o_child)):
+ raise ValueError(
+ 'inconsistent values for field {}: {} and {}'.format(
+ f, n_child, o_child))
+ node_stack.extend(n_child)
+ other_stack.extend(o_child)
+
+ elif isinstance(n_child, (gast.AST, ast.AST)):
+ node_stack.append(n_child)
+ other_stack.append(o_child)
+
+ elif n_child != o_child:
+ raise ValueError(
+ 'inconsistent values for field {}: {} and {}'.format(
+ f, n_child, o_child))
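
A minimal sketch of the new `parallel_walk` contract:

    from tensorflow.contrib.autograph.pyct import ast_util
    from tensorflow.contrib.autograph.pyct import parser

    node_a = parser.parse_str('def f(a):\n  return a + 1')
    node_b = parser.parse_str('def f(a):\n  return a + 2')

    # Walking a tree against itself yields matching node pairs.
    for n, o in ast_util.parallel_walk(node_a, node_a):
      assert type(n) is type(o)

    # Trees that differ (here, only in a constant) raise ValueError.
    try:
      for _ in ast_util.parallel_walk(node_a, node_b):
        pass
    except ValueError as e:
      print(e)
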
diff --git a/tensorflow/contrib/autograph/pyct/ast_util_test.py b/tensorflow/contrib/autograph/pyct/ast_util_test.py
index 981e398b93..2293c89720 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util_test.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util_test.py
@@ -44,7 +44,7 @@ class AstUtilTest(test.TestCase):
node, {qual_names.QN('a'): qual_names.QN('renamed_a')})
self.assertIsInstance(node.body[0].value.left.id, str)
- source, _ = compiler.ast_to_source(node)
+ source = compiler.ast_to_source(node)
self.assertEqual(source.strip(), 'renamed_a + b')
def test_rename_symbols_attributes(self):
@@ -54,7 +54,7 @@ class AstUtilTest(test.TestCase):
node = ast_util.rename_symbols(
node, {qual_names.from_str('b.c'): qual_names.QN('renamed_b_c')})
- source, _ = compiler.ast_to_source(node)
+ source = compiler.ast_to_source(node)
self.assertEqual(source.strip(), 'renamed_b_c = renamed_b_c.d')
def test_rename_symbols_annotations(self):
@@ -97,10 +97,10 @@ class AstUtilTest(test.TestCase):
d = ast_util.keywords_to_dict(keywords)
# Make sure we generate a usable dict node by attaching it to a variable and
# compiling everything.
- output = parser.parse_str('b = 3')
- output.body += (ast.Assign([ast.Name(id='d', ctx=ast.Store())], d),)
- result, _ = compiler.ast_to_object(output)
- self.assertDictEqual(result.d, {'a': 3, 'c': 1, 'd': 'e'})
+ node = parser.parse_str('def f(b): pass').body[0]
+ node.body.append(ast.Return(d))
+ result, _ = compiler.ast_to_object(node)
+ self.assertDictEqual(result.f(3), {'a': 3, 'c': 1, 'd': 'e'})
def assertMatch(self, target_str, pattern_str):
node = parser.parse_expression(target_str)
@@ -130,8 +130,8 @@ class AstUtilTest(test.TestCase):
'super(Bar, _).__init__(_)')
def _mock_apply_fn(self, target, source):
- target, _ = compiler.ast_to_source(target)
- source, _ = compiler.ast_to_source(source)
+ target = compiler.ast_to_source(target)
+ source = compiler.ast_to_source(source)
self._invocation_counts[(target.strip(), source.strip())] += 1
def test_apply_to_single_assignments_dynamic_unpack(self):
@@ -157,24 +157,40 @@ class AstUtilTest(test.TestCase):
})
def test_parallel_walk(self):
- ret = ast.Return(
- ast.BinOp(
- op=ast.Add(),
- left=ast.Name(id='a', ctx=ast.Load()),
- right=ast.Num(1)))
- node = ast.FunctionDef(
- name='f',
- args=ast.arguments(
- args=[ast.Name(id='a', ctx=ast.Param())],
- vararg=None,
- kwarg=None,
- defaults=[]),
- body=[ret],
- decorator_list=[],
- returns=None)
+ node = parser.parse_str(
+ textwrap.dedent("""
+ def f(a):
+ return a + 1
+ """))
for child_a, child_b in ast_util.parallel_walk(node, node):
self.assertEqual(child_a, child_b)
+ def test_parallel_walk_inconsistent_trees(self):
+ node_1 = parser.parse_str(
+ textwrap.dedent("""
+ def f(a):
+ return a + 1
+ """))
+ node_2 = parser.parse_str(
+ textwrap.dedent("""
+ def f(a):
+ return a + (a * 2)
+ """))
+ node_3 = parser.parse_str(
+ textwrap.dedent("""
+ def f(a):
+ return a + 2
+ """))
+ with self.assertRaises(ValueError):
+ for _ in ast_util.parallel_walk(node_1, node_2):
+ pass
+ # There is no particular reason to reject trees that differ only in the
+ # value of a constant.
+ # TODO(mdan): This should probably be allowed.
+ with self.assertRaises(ValueError):
+ for _ in ast_util.parallel_walk(node_1, node_3):
+ pass
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/autograph/pyct/cfg.py b/tensorflow/contrib/autograph/pyct/cfg.py
index 25fec7fd53..ba51dcf285 100644
--- a/tensorflow/contrib/autograph/pyct/cfg.py
+++ b/tensorflow/contrib/autograph/pyct/cfg.py
@@ -67,10 +67,8 @@ class Node(object):
if isinstance(self.ast_node, gast.FunctionDef):
return 'def %s' % self.ast_node.name
elif isinstance(self.ast_node, gast.withitem):
- source, _ = compiler.ast_to_source(self.ast_node.context_expr)
- return source.strip()
- source, _ = compiler.ast_to_source(self.ast_node)
- return source.strip()
+ return compiler.ast_to_source(self.ast_node.context_expr).strip()
+ return compiler.ast_to_source(self.ast_node).strip()
class Graph(
diff --git a/tensorflow/contrib/autograph/pyct/common_transformers/anf_test.py b/tensorflow/contrib/autograph/pyct/common_transformers/anf_test.py
index 81983a5ecb..aefbc69d8c 100644
--- a/tensorflow/contrib/autograph/pyct/common_transformers/anf_test.py
+++ b/tensorflow/contrib/autograph/pyct/common_transformers/anf_test.py
@@ -43,7 +43,7 @@ class AnfTransformerTest(test.TestCase):
return a
node, _ = parser.parse_entity(test_function)
- node = anf.transform(node, self._simple_source_info())
+ node = anf.transform(node.body[0], self._simple_source_info())
result, _ = compiler.ast_to_object(node)
self.assertEqual(test_function(), result.test_function())
diff --git a/tensorflow/contrib/autograph/pyct/compiler.py b/tensorflow/contrib/autograph/pyct/compiler.py
index c90a5e89c2..f9cee10962 100644
--- a/tensorflow/contrib/autograph/pyct/compiler.py
+++ b/tensorflow/contrib/autograph/pyct/compiler.py
@@ -30,44 +30,7 @@ import tempfile
import astor
import gast
-from tensorflow.contrib.autograph.pyct import anno
-from tensorflow.contrib.autograph.pyct import ast_util
from tensorflow.contrib.autograph.pyct import origin_info
-from tensorflow.contrib.autograph.pyct import parser
-
-
-def _build_source_map(node, code):
- """Return the Python objects represented by given AST.
-
- Compiling the AST code this way ensures that the source code is readable by
- e.g. `pdb` or `inspect`.
-
- Args:
- node: An AST node of the original generated code, before the source code is
- generated.
- code: The string representation of the source code for the newly generated
- code.
-
- Returns:
- Dict[CodeLocation, OriginInfo], a mapping between the user and AutoGraph
- generated code.
- """
- # After we have the final generated code we reparse it to get the final line
- # numbers. Then we walk through the generated and original ASTs in parallel
- # to build the mapping between the user and generated code.
- new_node = parser.parse_str(code)
- origin_info.resolve(new_node, code)
- source_mapping = {}
- for before, after in ast_util.parallel_walk(node, new_node):
- # Need both checks because if origin information is ever copied over to new
- # nodes then we need to rely on the fact that only the original user code
- # has the origin annotation.
- if (anno.hasanno(before, anno.Basic.ORIGIN) and
- anno.hasanno(after, anno.Basic.ORIGIN)):
- source_info = anno.getanno(before, anno.Basic.ORIGIN)
- new_line_number = anno.getanno(after, anno.Basic.ORIGIN).line_number
- source_mapping[new_line_number] = source_info
- return source_mapping
def ast_to_source(node, indentation=' '):
@@ -81,24 +44,28 @@ def ast_to_source(node, indentation=' '):
code: The source code generated from the AST object
source_mapping: A mapping between the user and AutoGraph generated code.
"""
- original_node = node
- if isinstance(node, gast.AST):
- node = gast.gast_to_ast(node)
+ if not isinstance(node, (list, tuple)):
+ node = (node,)
generator = astor.codegen.SourceGenerator(indentation, False,
astor.string_repr.pretty_string)
- generator.visit(node)
- generator.result.append('\n')
+
+ for n in node:
+ if isinstance(n, gast.AST):
+ n = gast.gast_to_ast(n)
+ generator.visit(n)
+ generator.result.append('\n')
+
# In some versions of Python, literals may appear as actual values. This
# ensures everything is string.
code = map(str, generator.result)
code = astor.source_repr.pretty_source(code).lstrip()
- source_mapping = _build_source_map(original_node, code)
- return code, source_mapping
+ return code
-def ast_to_object(node,
+def ast_to_object(nodes,
indentation=' ',
+ include_source_map=False,
source_prefix=None,
delete_on_exit=True):
"""Return the Python objects represented by given AST.
@@ -107,42 +74,46 @@ def ast_to_object(node,
e.g. `pdb` or `inspect`.
Args:
- node: The code to compile, as an AST object.
- indentation: The string to use for indentation.
- source_prefix: Optional string to print as-is into the source file.
- delete_on_exit: Whether to delete the temporary file used for compilation on
- exit.
+ nodes: Union[ast.AST, Iterable[ast.AST]], the code to compile, as an AST
+ object.
+ indentation: Text, the string to use for indentation.
+ include_source_map: bool, whether to attach a source map to the compiled
+ object. Also see origin_info.py.
+ source_prefix: Optional[Text], string to print as-is into the source file.
+ delete_on_exit: bool, whether to delete the temporary file used for
+ compilation on exit.
Returns:
- compiled_node: A module object containing the compiled source code.
+ compiled_nodes: A module object containing the compiled source code.
source: The source code of the compiled object
Raises:
ValueError: If ag_source_map__ is already in the namespace of the compiled
- node.
+ nodes.
"""
- # code_source_mapping does not yet include the offsets from import statements.
- source, code_source_mapping = ast_to_source(node, indentation=indentation)
+ if not isinstance(nodes, (list, tuple)):
+ nodes = (nodes,)
+
+ source = ast_to_source(nodes, indentation=indentation)
+
+ if source_prefix:
+ source = source_prefix + '\n' + source
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
- # TODO(znado): move into an _offset_source_map() helper function.
- # Need to offset the generated line numbers by the number of import lines.
- if source_prefix:
- num_import_lines = source_prefix.count('\n') + 1
- else:
- num_import_lines = 0
- source_mapping = {}
- for line_number, original_position in code_source_mapping.items():
- source_map_key = origin_info.CodeLocation(
- file_path=f.name, line_number=line_number + num_import_lines)
- source_mapping[source_map_key] = original_position
module_name = os.path.basename(f.name[:-3])
- if source_prefix:
- f.write(source_prefix)
- f.write('\n')
f.write(source)
+
+ if isinstance(nodes, (list, tuple)):
+ indices = range(-len(nodes), 0)
+ else:
+ indices = (-1,)
+
+ if include_source_map:
+ source_map = origin_info.source_map(nodes, source, f.name, indices)
+
+ # TODO(mdan): Try flush() and delete=False instead.
if delete_on_exit:
atexit.register(lambda: os.remove(f.name))
- compiled_node = imp.load_source(module_name, f.name)
+ compiled_nodes = imp.load_source(module_name, f.name)
# TODO(znado): Clean this up so we don't need to attach it to the namespace.
# TODO(znado): This does not work for classes because their methods share a
@@ -158,11 +129,13 @@ def ast_to_object(node,
# is hard, and this cleanly fixes the
# issues encountered with nested functions because this is attached to the
# outermost one.
- source_map_name = 'ag_source_map__'
- if source_map_name in compiled_node.__dict__:
- raise ValueError('cannot convert %s because is has namespace attribute '
- '"%s", which is reserved for AutoGraph.' %
- (compiled_node, source_map_name))
- compiled_node.__dict__[source_map_name] = source_mapping
-
- return compiled_node, source
+ if include_source_map:
+ # TODO(mdan): This name should be decided by the caller.
+ source_map_name = 'ag_source_map__'
+ if source_map_name in compiled_nodes.__dict__:
+ raise ValueError('cannot convert %s because it has namespace attribute '
+ '"%s", which is reserved for AutoGraph.' %
+ (compiled_nodes, source_map_name))
+ compiled_nodes.__dict__[source_map_name] = source_map
+
+ return compiled_nodes, source
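
Putting the reworked compiler API together (a sketch; without origin annotations on the input, the attached source map is simply empty):

    from tensorflow.contrib.autograph.pyct import compiler
    from tensorflow.contrib.autograph.pyct import parser

    node = parser.parse_str('def triple(x):\n  return 3 * x').body[0]

    # ast_to_source now returns just the generated code string.
    source = compiler.ast_to_source(node)

    # ast_to_object accepts a node or a list of nodes; the source map is
    # attached only when explicitly requested.
    module, source = compiler.ast_to_object(node, include_source_map=True)
    print(module.triple(2))  # 6
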
diff --git a/tensorflow/contrib/autograph/pyct/compiler_test.py b/tensorflow/contrib/autograph/pyct/compiler_test.py
index e29fa9324c..cf783da6a3 100644
--- a/tensorflow/contrib/autograph/pyct/compiler_test.py
+++ b/tensorflow/contrib/autograph/pyct/compiler_test.py
@@ -59,7 +59,7 @@ class CompilerTest(test.TestCase):
value=gast.Str('c'))
])
- source, _ = compiler.ast_to_source(node, indentation=' ')
+ source = compiler.ast_to_source(node, indentation=' ')
self.assertEqual(
textwrap.dedent("""
if 1:
diff --git a/tensorflow/contrib/autograph/pyct/origin_info.py b/tensorflow/contrib/autograph/pyct/origin_info.py
index 614e346634..1aad2f47df 100644
--- a/tensorflow/contrib/autograph/pyct/origin_info.py
+++ b/tensorflow/contrib/autograph/pyct/origin_info.py
@@ -22,49 +22,115 @@ import collections
import gast
from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import parser
from tensorflow.python.util import tf_inspect
-class CodeLocation(
- collections.namedtuple('CodeLocation', ('file_path', 'line_number'))):
- """Location of a line of code.
+class LineLocation(
+ collections.namedtuple('LineLocation', ('filename', 'lineno'))):
+ """Similar to Location, but without column information.
Attributes:
- file_path: text, the full path to the file containing the code.
- line_number: Int, the 1-based line number of the code in its file.
+ filename: Text
+ lineno: int, 1-based
"""
pass
+class Location(
+ collections.namedtuple('Location', ('filename', 'lineno', 'col_offset'))):
+ """Encodes code location information.
+
+ Attributes:
+ filename: Text
+ lineno: int, 1-based
+ col_offset: int
+ """
+
+ @property
+ def line_loc(self):
+ return LineLocation(self.filename, self.lineno)
+
+
class OriginInfo(
- collections.namedtuple('OriginInfo',
- ('file_path', 'function_name', 'line_number',
- 'column_offset', 'source_code_line'))):
+ collections.namedtuple(
+ 'OriginInfo',
+ ('loc', 'function_name', 'source_code_line'))):
"""Container for information about the source code before conversion.
- Instances of this class contain information about the source code that
- transformed code originated from. Examples include:
- * line number
- * file name
- * original user code
+ Attributes:
+ loc: Location
+ function_name: Optional[Text]
+ source_code_line: Text
"""
def as_frame(self):
- """Makes a traceback frame tuple.
-
- Returns:
- A tuple of (file_path, line_number, function_name, source_code_line).
- """
- return (self.file_path, self.line_number, self.function_name,
+ """Returns a 4-tuple consistent with the return of traceback.extract_tb."""
+ return (self.loc.filename, self.loc.lineno, self.function_name,
self.source_code_line)
+# TODO(mdan): This source map should be a class - easier to refer to.
+def source_map(nodes, code, filename, indices_in_code):
+ """Creates a source map between an annotated AST and the code it compiles to.
+
+ Args:
+ nodes: Iterable[ast.AST, ...]
+ code: Text
+ filename: Optional[Text]
+ indices_in_code: Union[int, Iterable[int, ...]], the positions at which
+ nodes appear in code. The parser always returns a module when parsing
+ code. This argument indicates the position in that module's body at
+ which the corresponding node should appear.
+
+ Returns:
+ Dict[LineLocation, OriginInfo], mapping line locations in code to the
+ locations indicated by origin annotations in nodes.
+ """
+ reparsed_nodes = parser.parse_str(code)
+ reparsed_nodes = [reparsed_nodes.body[i] for i in indices_in_code]
+
+ resolve(reparsed_nodes, code)
+ result = {}
+
+ for before, after in ast_util.parallel_walk(nodes, reparsed_nodes):
+ # Note: generated code might not be mapped back to its origin.
+ # TODO(mdan): Generated code should always be mapped to something.
+ origin_info = anno.getanno(before, anno.Basic.ORIGIN, default=None)
+ final_info = anno.getanno(after, anno.Basic.ORIGIN, default=None)
+ if origin_info is None or final_info is None:
+ continue
+
+ line_loc = LineLocation(filename, final_info.loc.lineno)
+
+ existing_origin = result.get(line_loc)
+ if existing_origin is not None:
+ # Overlaps may exist because of child nodes, but they rarely map to
+ # different line locations. The exception is decorated functions, where
+ # both lines are mapped to the same line in the AST.
+
+ # Line overlaps: keep bottom node.
+ if existing_origin.loc.line_loc == origin_info.loc.line_loc:
+ if existing_origin.loc.lineno >= origin_info.loc.lineno:
+ continue
+
+ # In case of overlaps, keep the leftmost node.
+ if existing_origin.loc.col_offset <= origin_info.loc.col_offset:
+ continue
+
+ result[line_loc] = origin_info
+
+ return result
+
+
# TODO(znado): Consider refactoring this into a Visitor.
-def resolve(node, source, function=None):
+# TODO(mdan): Does this work correctly with inner functions?
+def resolve(nodes, source, function=None):
"""Adds an origin information to all nodes inside the body of function.
Args:
- node: The AST node for the function whose body nodes will be annotated.
+ nodes: Union[ast.AST, Iterable[ast.AST, ...]]
source: Text, the source code string for the function whose body nodes will
be annotated.
function: Callable, the function that will have all nodes inside of it
@@ -76,25 +142,32 @@ def resolve(node, source, function=None):
A tuple of the AST node for function and a String containing its source
code.
"""
+ if not isinstance(nodes, (list, tuple)):
+ nodes = (nodes,)
+
if function:
_, function_lineno = tf_inspect.getsourcelines(function)
function_filepath = tf_inspect.getsourcefile(function)
else:
function_lineno = None
function_filepath = None
+
source_lines = source.split('\n')
- for n in gast.walk(node):
- if hasattr(n, 'lineno'):
- # n.lineno is relative to the start of the enclosing function, so need to
- # offset it by the line of the function.
- source_code_line = source_lines[n.lineno - 1]
+ for node in nodes:
+ for n in gast.walk(node):
+ if not hasattr(n, 'lineno'):
+ continue
+
+ lineno_in_body = n.lineno
+
+ source_code_line = source_lines[lineno_in_body - 1]
if function:
- source_lineno = n.lineno + function_lineno - 1
+ source_lineno = function_lineno + lineno_in_body
function_name = function.__name__
else:
- source_lineno = n.lineno
+ source_lineno = lineno_in_body
function_name = None
- anno.setanno(
- n, anno.Basic.ORIGIN,
- OriginInfo(function_filepath, function_name, source_lineno,
- n.col_offset, source_code_line))
+
+ location = Location(function_filepath, source_lineno, n.col_offset)
+ origin = OriginInfo(location, function_name, source_code_line)
+ anno.setanno(n, anno.Basic.ORIGIN, origin)
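
For orientation, the resolve/source_map round trip looks like this (a sketch; the new test file below exercises it in more detail):

    from tensorflow.contrib.autograph.pyct import compiler
    from tensorflow.contrib.autograph.pyct import origin_info
    from tensorflow.contrib.autograph.pyct import parser

    def fn(x):
      return x + 1

    node, source = parser.parse_entity(fn)
    fn_node = node.body[0]
    origin_info.resolve(fn_node, source)

    generated = compiler.ast_to_source(fn_node)
    # Maps LineLocation in the generated code back to OriginInfo entries.
    sm = origin_info.source_map(fn_node, generated, 'generated.py', [0])
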
diff --git a/tensorflow/contrib/autograph/pyct/origin_info_test.py b/tensorflow/contrib/autograph/pyct/origin_info_test.py
new file mode 100644
index 0000000000..6d7d8b1622
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/origin_info_test.py
@@ -0,0 +1,101 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for origin_info module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import origin_info
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.python.platform import test
+
+
+class OriginInfoTest(test.TestCase):
+
+ def test_source_map(self):
+
+ def test_fn(x):
+ if x > 0:
+ x += 1
+ return x
+
+ node, source = parser.parse_entity(test_fn)
+ fn_node = node.body[0]
+ origin_info.resolve(fn_node, source)
+
+ # Insert a traced line.
+ new_node = parser.parse_str('x = abs(x)').body[0]
+ anno.copyanno(fn_node.body[0], new_node, anno.Basic.ORIGIN)
+ fn_node.body.insert(0, new_node)
+
+ # Insert an untraced line.
+ fn_node.body.insert(0, parser.parse_str('x = 0').body[0])
+
+ modified_source = compiler.ast_to_source(fn_node)
+
+ source_map = origin_info.source_map(fn_node, modified_source,
+ 'test_filename', [0])
+
+ loc = origin_info.LineLocation('test_filename', 1)
+ origin = source_map[loc]
+ self.assertEqual(origin.source_code_line, 'def test_fn(x):')
+ self.assertEqual(origin.loc.lineno, 1)
+
+ # The untraced line, inserted second.
+ loc = origin_info.LineLocation('test_filename', 2)
+ self.assertFalse(loc in source_map)
+
+ # The traced line, inserted first.
+ loc = origin_info.LineLocation('test_filename', 3)
+ origin = source_map[loc]
+ self.assertEqual(origin.source_code_line, ' if x > 0:')
+ self.assertEqual(origin.loc.lineno, 2)
+
+ loc = origin_info.LineLocation('test_filename', 4)
+ origin = source_map[loc]
+ self.assertEqual(origin.source_code_line, ' if x > 0:')
+ self.assertEqual(origin.loc.lineno, 2)
+
+ def test_resolve(self):
+
+ def test_fn(x):
+ """Docstring."""
+ return x # comment
+
+ node, source = parser.parse_entity(test_fn)
+ fn_node = node.body[0]
+ origin_info.resolve(fn_node, source)
+
+ origin = anno.getanno(fn_node, anno.Basic.ORIGIN)
+ self.assertEqual(origin.loc.lineno, 1)
+ self.assertEqual(origin.loc.col_offset, 0)
+ self.assertEqual(origin.source_code_line, 'def test_fn(x):')
+
+ origin = anno.getanno(fn_node.body[0], anno.Basic.ORIGIN)
+ self.assertEqual(origin.loc.lineno, 2)
+ self.assertEqual(origin.loc.col_offset, 2)
+ self.assertEqual(origin.source_code_line, ' """Docstring."""')
+
+ origin = anno.getanno(fn_node.body[1], anno.Basic.ORIGIN)
+ self.assertEqual(origin.loc.lineno, 3)
+ self.assertEqual(origin.loc.col_offset, 2)
+ self.assertEqual(origin.source_code_line, ' return x # comment')
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/autograph/pyct/parser.py b/tensorflow/contrib/autograph/pyct/parser.py
index c961efa892..112ed46a1e 100644
--- a/tensorflow/contrib/autograph/pyct/parser.py
+++ b/tensorflow/contrib/autograph/pyct/parser.py
@@ -37,6 +37,7 @@ def parse_entity(entity):
def parse_str(src):
"""Returns the AST of given piece of code."""
+ # TODO(mdan): This should exclude the module things are autowrapped in.
return gast.parse(src)
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py b/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
index 32802069ba..2d8f922a45 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
@@ -91,12 +91,20 @@ class LiveValueResolver(transformer.Base):
if anno.hasanno(node.value, 'live_val'):
assert anno.hasanno(node.value, 'fqn')
parent_object = anno.getanno(node.value, 'live_val')
- if not hasattr(parent_object, node.attr):
- raise AttributeError('%s has no attribute %s' % (parent_object,
- node.attr))
+
anno.setanno(node, 'parent_type', type(parent_object))
- anno.setanno(node, 'live_val', getattr(parent_object, node.attr))
anno.setanno(node, 'fqn', anno.getanno(node.value, 'fqn') + (node.attr,))
+ if hasattr(parent_object, node.attr):
+ # This can happen when the attribute's creation and use depend on the
+ # same static condition, for example:
+ #
+ # if cond:
+ # foo.bar = baz
+ # if cond:
+ # x = foo.bar
+ #
+ anno.setanno(node, 'live_val', getattr(parent_object, node.attr))
+
# TODO(mdan): Investigate the role built-in annotations can play here.
elif anno.hasanno(node.value, 'type'):
parent_type = anno.getanno(node.value, 'type')
diff --git a/tensorflow/contrib/autograph/pyct/templates.py b/tensorflow/contrib/autograph/pyct/templates.py
index 72d1d3b269..5831d57ceb 100644
--- a/tensorflow/contrib/autograph/pyct/templates.py
+++ b/tensorflow/contrib/autograph/pyct/templates.py
@@ -140,8 +140,8 @@ class ReplaceTransformer(gast.NodeTransformer):
def _set_inner_child_context(self, node, ctx):
if isinstance(node, gast.Attribute):
- self._set_inner_child_context(node.value, ctx)
- node.ctx = gast.Load()
+ self._set_inner_child_context(node.value, gast.Load())
+ node.ctx = ctx
elif isinstance(node, gast.Tuple):
for e in node.elts:
self._set_inner_child_context(e, ctx)
diff --git a/tensorflow/contrib/autograph/pyct/templates_test.py b/tensorflow/contrib/autograph/pyct/templates_test.py
index a8bbc5a4de..77e8ff62fd 100644
--- a/tensorflow/contrib/autograph/pyct/templates_test.py
+++ b/tensorflow/contrib/autograph/pyct/templates_test.py
@@ -97,6 +97,19 @@ class TemplatesTest(test.TestCase):
with self.assertRaises(ValueError):
templates.replace(template, foo=1)
+ def test_replace_attribute_context(self):
+ template = """
+ def test_fn(foo):
+ foo = 0
+ """
+
+ node = templates.replace(
+ template,
+ foo=parser.parse_expression('a.b.c'))[0]
+ self.assertIsInstance(node.body[0].targets[0].ctx, gast.Store)
+ self.assertIsInstance(node.body[0].targets[0].value.ctx, gast.Load)
+ self.assertIsInstance(node.body[0].targets[0].value.value.ctx, gast.Load)
+
def test_replace_call_keyword(self):
template = """
def test_fn():
diff --git a/tensorflow/contrib/autograph/pyct/testing/BUILD b/tensorflow/contrib/autograph/pyct/testing/BUILD
new file mode 100644
index 0000000000..b89affdc98
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/testing/BUILD
@@ -0,0 +1,57 @@
+licenses(["notice"]) # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+ name = "all_files",
+ srcs = glob(
+ ["**/*"],
+ exclude = [
+ "**/METADATA",
+ "**/OWNERS",
+ ],
+ ),
+ visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+ name = "testing",
+ srcs = [
+ "codegen.py",
+ ],
+ srcs_version = "PY2AND3",
+ visibility = ["//visibility:public"],
+ deps = [
+ "//tensorflow/contrib/autograph/pyct",
+ "//tensorflow/contrib/autograph/utils",
+ "@gast_archive//:gast",
+ ],
+)
+
+py_test(
+ name = "codegen_test",
+ size = "large",
+ srcs = ["codegen_test.py"],
+ srcs_version = "PY2AND3",
+ tags = ["no_windows"],
+ deps = [
+ ":testing",
+ "//tensorflow/contrib/autograph/pyct",
+ "//tensorflow/python:client_testlib",
+ "@gast_archive//:gast",
+ ],
+)
+
+# py_test(
+# name = "dataflow_test",
+# size = "large",
+# srcs = ["dataflow_test.py"],
+# srcs_version = "PY2AND3",
+# tags = ["no_windows"],
+# deps = [
+# ":testing",
+# "@gast_archive//:gast",
+# "//tensorflow/contrib/autograph/pyct",
+# "//tensorflow/python:client_testlib",
+# ],
+# )
diff --git a/tensorflow/contrib/autograph/pyct/testing/codegen.py b/tensorflow/contrib/autograph/pyct/testing/codegen.py
new file mode 100644
index 0000000000..279e7c09dc
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/testing/codegen.py
@@ -0,0 +1,234 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Random code generation for testing/fuzzing."""
+# pylint: disable=invalid-name
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import string
+
+import gast
+import numpy as np
+
+from tensorflow.contrib.autograph.pyct import templates
+
+
+class NodeSampler(object):
+ sample_map = None
+
+ def sample(self):
+ nodes, magnitudes = zip(*self.sample_map.items())
+ return np.random.choice(
+ nodes, p=np.array(magnitudes, dtype='float32') / np.sum(magnitudes))
+
+
+class StatementSampler(NodeSampler):
+ sample_map = dict((
+ (gast.Assign, 10),
+ (gast.Print, 1),
+ (gast.If, 2),
+ (gast.While, 2),
+ (gast.For, 0),
+ ))
+
+
+class ExpressionSampler(NodeSampler):
+ sample_map = dict((
+ (gast.UnaryOp, 1),
+ (gast.BinOp, 8),
+ (gast.Name, 1),
+ (gast.Call, 0),
+ ))
+
+
+class CompareSampler(NodeSampler):
+ sample_map = dict((
+ (gast.Eq, 1),
+ (gast.NotEq, 1),
+ (gast.Lt, 1),
+ (gast.LtE, 1),
+ (gast.Gt, 1),
+ (gast.GtE, 1),
+ (gast.Is, 1),
+ (gast.IsNot, 1),
+ ))
+
+
+class BinaryOpSampler(NodeSampler):
+ sample_map = dict((
+ (gast.Add, 1),
+ (gast.Sub, 1),
+ (gast.Mult, 1),
+ (gast.Div, 1),
+ (gast.FloorDiv, 1),
+ (gast.Mod, 1),
+ (gast.Pow, 1),
+ ))
+
+
+class UnaryOpSampler(NodeSampler):
+ sample_map = dict(((gast.USub, 1), (gast.UAdd, 0)))
+
+
+class NameSampler(NodeSampler):
+ sample_map = dict((
+ ('new', 1),
+ ('existing', 1),
+ ))
+
+
+N_CONTROLFLOW_STATEMENTS = 10
+N_FUNCTIONDEF_STATEMENTS = 10
+
+
+class CodeGenerator(object):
+ """Generate random syntactically-valid Python ASTs."""
+
+ def __init__(self, max_depth=3, depth=0):
+ self.max_depth = max_depth
+ self.depth = depth
+
+ def generate_statement(self):
+ """Generate a statement node, dispatching to the correct class method."""
+ desired_node = StatementSampler().sample()
+ self.depth += 1
+
+ # Enforce some constraints on generating statements.
+ # E.g., if statements need at least 3 readable variables.
+ # If we fail to satisfy our constraints, draw another sample.
+ if desired_node in (gast.While, gast.For, gast.If):
+ if self.depth > self.max_depth:
+ return self.generate_statement()
+
+ # Go get the generator method and run it
+ method = 'generate_' + desired_node.__name__
+ visitor = getattr(self, method)
+ node = visitor()
+ self.depth -= 1
+ return node
+
+ def sample_node_list(self, low, high, generator):
+ """Generate a list of statements of random length.
+
+ Args:
+ low: Fewest number of statements to generate.
+ high: Upper bound (exclusive) on the number of statements to generate.
+ generator: Function to call to generate nodes.
+
+ Returns:
+ A list of statements.
+ """
+ statements = []
+ for _ in range(np.random.randint(low, high)):
+ statements.append(generator())
+ return statements
+
+ def generate_Name(self, ctx=gast.Load()):
+ variable_name = '_' + ''.join(
+ random.choice(string.ascii_lowercase) for _ in range(4))
+ return gast.Name(variable_name, ctx=ctx, annotation=None)
+
+ def generate_BinOp(self):
+ # TODO(alexbw): convert to generate_expression when we get to limit
+ # expression depth.
+ op = BinaryOpSampler().sample()()
+ return gast.BinOp(self.generate_Name(), op, self.generate_Name())
+
+ def generate_Compare(self):
+ op = CompareSampler().sample()()
+ return gast.Compare(self.generate_Name(), [op], [self.generate_Name()])
+
+ def generate_UnaryOp(self):
+ operand = self.generate_Name()
+ op = UnaryOpSampler().sample()()
+ return gast.UnaryOp(op, operand)
+
+ def generate_expression(self):
+ desired_node = ExpressionSampler().sample()
+ # Go get the generator method and run it
+ method = 'generate_' + desired_node.__name__
+ generator = getattr(self, method)
+ return generator()
+
+ def generate_Assign(self):
+ """Generate an Assign node."""
+ # Generate left-hand side
+ target_node = self.generate_Name(gast.Store())
+ # Generate right-hand side
+ value_node = self.generate_expression()
+ # Put it all together
+ node = gast.Assign(targets=[target_node], value=value_node)
+ return node
+
+ def generate_If(self):
+ """Generate an If node."""
+ test = self.generate_Compare()
+
+ # Generate true branch statements
+ body = self.sample_node_list(
+ low=1,
+ high=N_CONTROLFLOW_STATEMENTS // 2,
+ generator=self.generate_statement)
+
+ # Generate false branch statements
+ orelse = self.sample_node_list(
+ low=1,
+ high=N_CONTROLFLOW_STATEMENTS // 2,
+ generator=self.generate_statement)
+
+ node = gast.If(test, body, orelse)
+ return node
+
+ def generate_While(self):
+ """Generate a While node."""
+
+ test = self.generate_Compare()
+ body = self.sample_node_list(
+ low=1, high=N_CONTROLFLOW_STATEMENTS, generator=self.generate_statement)
+ orelse = [] # not generating else statements
+
+ node = gast.While(test, body, orelse)
+ return node
+
+ def generate_Call(self):
+ raise NotImplementedError
+
+ def generate_Return(self):
+ return gast.Return(self.generate_expression())
+
+ def generate_Print(self):
+ return templates.replace('print(x)', x=self.generate_expression())[0]
+
+ def generate_FunctionDef(self):
+ """Generate a FunctionDef node."""
+
+ # Generate the arguments, register them as available
+ arg_vars = self.sample_node_list(
+ low=2, high=10, generator=lambda: self.generate_Name(gast.Param()))
+ args = gast.arguments(arg_vars, None, [], [], None, [])
+
+ # Generate the function body
+ body = self.sample_node_list(
+ low=1, high=N_FUNCTIONDEF_STATEMENTS, generator=self.generate_statement)
+ body.append(self.generate_Return())
+ fn_name = self.generate_Name().id
+ node = gast.FunctionDef(fn_name, args, body, (), None)
+ return node
+
+
+def generate_random_functiondef():
+ return CodeGenerator().generate_FunctionDef()
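
A sketch of how the fuzzer is meant to be driven (mirroring the test added below):

    import numpy as np
    from tensorflow.contrib.autograph.pyct import compiler
    from tensorflow.contrib.autograph.pyct.testing import codegen

    np.random.seed(0)  # the samplers draw from np.random
    node = codegen.generate_random_functiondef()
    module, source = compiler.ast_to_object(node)
    print(source)  # a random but syntactically valid Python function
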
diff --git a/tensorflow/contrib/autograph/pyct/testing/codegen_test.py b/tensorflow/contrib/autograph/pyct/testing/codegen_test.py
new file mode 100644
index 0000000000..255c3b2a2e
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/testing/codegen_test.py
@@ -0,0 +1,40 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for type_info module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct.testing import codegen
+from tensorflow.python.platform import test
+
+
+class CodeGenTest(test.TestCase):
+
+ def test_codegen_gens(self):
+ np.random.seed(0)
+ for _ in range(1000):
+ node = codegen.generate_random_functiondef()
+ fn = compiler.ast_to_object(node)
+ self.assertIsNotNone(
+ fn, 'Generated invalid AST that could not convert to source.')
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index bbdfefc50a..969ca12244 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -398,18 +398,36 @@ class Base(gast.NodeTransformer):
try:
source, _ = compiler.ast_to_source(node)
return source
- except AssertionError:
+ # pylint: disable=broad-except
+ # This function is used for error reporting. If an exception occurs here,
+ # it should be suppressed, in favor of emitting as informative a message
+ # about the original error as possible.
+ except Exception:
return '<could not convert AST to source>'
def visit(self, node):
+ if not isinstance(node, gast.AST):
+ # This is not that uncommon a mistake: various node bodies are lists, for
+ # example, posing a land mine for transformers that need to recursively
+ # call `visit`. The error needs to be raised before the exception handler
+ # below is installed, because said handler will mess up if `node` is not,
+ # in fact, a node.
+ msg = (
+ 'invalid value for "node": expected "ast.AST", got "{}"; to'
+ ' visit lists of nodes, use "visit_block" instead').format(type(node))
+ raise ValueError(msg)
+
source_code = self.entity_info.source_code
source_file = self.entity_info.source_file
did_enter_function = False
local_scope_size_at_entry = len(self._local_scope_state)
+ processing_expr_node = False
try:
if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
did_enter_function = True
+ elif isinstance(node, gast.Expr):
+ processing_expr_node = True
if did_enter_function:
self._enclosing_entities.append(node)
@@ -418,9 +436,23 @@ class Base(gast.NodeTransformer):
self._lineno = node.lineno
self._col_offset = node.col_offset
+ if processing_expr_node:
+ entry_expr_value = node.value
+
if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
result = super(Base, self).visit(node)
+ # Adjust for consistency: replacing the value of an Expr with
+ # an Assign node removes the need for the Expr node.
+ if processing_expr_node:
+ if isinstance(result, gast.Expr) and result.value != entry_expr_value:
+ # When the replacement is a list, it is assumed that the list came
+ # from a template that contained a number of statements, which
+ # themselves are standalone and don't require an enclosing Expr.
+ if isinstance(result.value,
+ (list, tuple, gast.Assign, gast.AugAssign)):
+ result = result.value
+
# On exception, the local scope integrity is not guaranteed.
if did_enter_function:
self._enclosing_entities.pop()
diff --git a/tensorflow/contrib/autograph/pyct/transformer_test.py b/tensorflow/contrib/autograph/pyct/transformer_test.py
index 19b80b09ac..a37e922a1d 100644
--- a/tensorflow/contrib/autograph/pyct/transformer_test.py
+++ b/tensorflow/contrib/autograph/pyct/transformer_test.py
@@ -282,6 +282,88 @@ class TransformerTest(test.TestCase):
self.assertTrue(isinstance(node.body[1].body[0], gast.Assign))
self.assertTrue(isinstance(node.body[1].body[1], gast.Return))
+ def test_robust_error_on_list_visit(self):
+
+ class BrokenTransformer(transformer.Base):
+
+ def visit_If(self, node):
+ # This is broken because visit expects a single node, not a list, and
+ # the body of an if is a list.
+ # Importantly, the default error handling in visit also expects a single
+ # node. Therefore, mistakes like this need to trigger a type error
+ # before the visit called here installs its error handler.
+ # That type error can then be caught by the enclosing call to visit,
+ # and correctly blame the If node.
+ self.visit(node.body)
+ return node
+
+ def test_function(x):
+ if x > 0:
+ return x
+
+ tr = BrokenTransformer(self._simple_source_info())
+
+ node, _ = parser.parse_entity(test_function)
+ with self.assertRaises(transformer.AutographParseError) as cm:
+ node = tr.visit(node)
+ obtained_message = str(cm.exception)
+ expected_message = r'expected "ast.AST", got "\<(type|class) \'list\'\>"'
+ self.assertRegexpMatches(obtained_message, expected_message)
+ # The exception should point at the if statement, not any place else. Could
+ # also check the stack trace.
+ self.assertTrue(
+ 'Occurred at node:\nIf' in obtained_message, obtained_message)
+ self.assertTrue(
+ 'Occurred at node:\nFunctionDef' not in obtained_message,
+ obtained_message)
+ self.assertTrue(
+ 'Occurred at node:\nReturn' not in obtained_message, obtained_message)
+
+ def test_robust_error_on_ast_corruption(self):
+ # A child class should not be able to be so broken that it causes the error
+ # handling in `transformer.Base` to raise an exception. Why not? Because
+ # then the original error location is dropped, and an error handler higher
+ # up in the call stack gives misleading information.
+
+ # Here we test that the error handling in `visit` completes, and blames the
+ # correct original exception, even if the AST gets corrupted.
+
+ class NotANode(object):
+ pass
+
+ class BrokenTransformer(transformer.Base):
+
+ def visit_If(self, node):
+ node.body = NotANode()
+ raise ValueError('I blew up')
+
+ def test_function(x):
+ if x > 0:
+ return x
+
+ tr = BrokenTransformer(self._simple_source_info())
+
+ node, _ = parser.parse_entity(test_function)
+ with self.assertRaises(transformer.AutographParseError) as cm:
+ node = tr.visit(node)
+ obtained_message = str(cm.exception)
+ # The message should reference the exception actually raised, not anything
+ # from the exception handler.
+ expected_substring = 'I blew up'
+ self.assertTrue(expected_substring in obtained_message, obtained_message)
+ # Expect the exception to have failed to parse the corrupted AST
+ self.assertTrue(
+ '<could not convert AST to source>' in obtained_message,
+ obtained_message)
+ # The exception should point at the if statement, not any place else. Could
+ # also check the stack trace.
+ self.assertTrue(
+ 'Occurred at node:\nIf' in obtained_message, obtained_message)
+ self.assertTrue(
+ 'Occurred at node:\nFunctionDef' not in obtained_message,
+ obtained_message)
+ self.assertTrue(
+ 'Occurred at node:\nReturn' not in obtained_message, obtained_message)
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index 71079cfdc0..ccbe5fc954 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.autograph.utils import type_check
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import list_ops
from tensorflow.python.ops import logging_ops
from tensorflow.python.ops import math_ops
@@ -50,7 +51,9 @@ def dynamic_builtin(f, *args, **kwargs):
def dynamic_len(list_or_tensor):
"""Implementation of len using dynamic dispatch."""
- if tensor_util.is_tensor(list_or_tensor):
+ if _is_tensor_list(list_or_tensor):
+ return list_ops.tensor_list_length(list_or_tensor)
+ elif tensor_util.is_tensor(list_or_tensor):
shape = list_or_tensor.shape
if not shape.ndims:
raise ValueError(
@@ -59,6 +62,11 @@ def dynamic_len(list_or_tensor):
return len(list_or_tensor)
+def _is_tensor_list(list_or_tensor):
+ return (tensor_util.is_tensor(list_or_tensor)
+ and list_or_tensor.dtype == dtypes.variant)
+
+
def dynamic_int(num_or_tensor, **kwargs):
"""Implementation of int() using dynamic dispatch."""
if tensor_util.is_tensor(num_or_tensor):
diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md
index ef3c60069e..d7c71a20ed 100644
--- a/tensorflow/contrib/bigtable/README.md
+++ b/tensorflow/contrib/bigtable/README.md
@@ -1,10 +1,344 @@
# Bigtable #
-[Google Cloud Bigtable](https://cloud.google.com/bigtable/) is a high
+[Cloud Bigtable](https://cloud.google.com/bigtable/) is a high
performance storage system that can store and serve training data. This contrib
package contains an experimental integration with TensorFlow.
> **Status: Highly experimental.** The current implementation is very much in
> flux. Please use at your own risk! :-)
-<!-- TODO(saeta): Document usage / methods / etc. -->
+The TensorFlow integration with Cloud Bigtable is tailored to common TensorFlow
+usage and workloads. It is currently optimized for reading from Cloud Bigtable
+at high speed, in particular to feed modern accelerators. For general-purpose
+Cloud Bigtable APIs, see the
+[official Cloud Bigtable client library documentation][clientdoc].
+
+[clientdoc]: https://cloud.google.com/bigtable/docs/reference/libraries
+
+## Sample Use
+
+There are three main reading styles supported by the `BigtableTable` class:
+
+ 1. **Reading keys**: Read only the row keys in a table. Keys are returned in
+    sorted order from the table. Most key reading operations retrieve all keys
+    in a contiguous range; however, the `sample_keys` operation skips keys and
+    operates on the whole table (not just a contiguous subset).
+ 2. **Retrieving a row's values**: Given a row key, look up the data associated
+    with a defined set of columns. This operation takes advantage of Cloud
+    Bigtable's low latency and excellent support for random access.
+ 3. **Scanning ranges**: Given a contiguous range of rows, retrieve both the
+    row keys and the data associated with a fixed set of columns. This
+    operation takes advantage of Cloud Bigtable's high-throughput scans, and is
+    the most efficient way to read data.
+
+When using the Cloud Bigtable API, the workflow is:
+
+ 1. Create a `BigtableClient` object.
+ 2. Use the `BigtableClient` to create `BigtableTable` objects corresponding to
+ each table in the Cloud Bigtable instance you would like to access.
+ 3. Call methods on the `BigtableTable` object to create `tf.data.Dataset`s to
+ retrieve data.
+
+The following example shows how to read all row keys with the prefix
+`train-`.
+
+```python
+import tensorflow as tf
+
+GCP_PROJECT_ID = '<FILL_ME_IN>'
+BIGTABLE_INSTANCE_ID = '<FILL_ME_IN>'
+BIGTABLE_TABLE_NAME = '<FILL_ME_IN>'
+PREFIX = 'train-'
+
+def main():
+ client = tf.contrib.cloud.BigtableClient(GCP_PROJECT_ID, BIGTABLE_INSTANCE_ID)
+ table = client.table(BIGTABLE_TABLE_NAME)
+ dataset = table.keys_by_prefix_dataset(PREFIX)
+ iterator = dataset.make_initializable_iterator()
+ get_next_op = iterator.get_next()
+
+ with tf.Session() as sess:
+ print('Initializing the iterator.')
+ sess.run(iterator.initializer)
+ print('Retrieving rows:')
+ row_index = 0
+ while True:
+ try:
+ row_key = sess.run(get_next_op)
+ print('Row key %d: %s' % (row_index, row_key))
+ row_index += 1
+ except tf.errors.OutOfRangeError:
+ print('Finished reading data!')
+ break
+
+if __name__ == '__main__':
+ main()
+```
+
+### Reading row keys
+
+Read only the row keys in a table. Keys are returned in sorted order from the
+table. Most key reading operations retrieve all keys in a contiguous range;
+however, the `sample_keys` operation skips keys and operates on the whole table
+(not just a contiguous subset).
+
+There are three methods to retrieve row keys (see the sketch after this list):
+
+ - `table.keys_by_range_dataset(start, end)`: Retrieves row keys from `start`
+   to `end`. The range is "half-open": it includes `start` if `start` is
+   present in the table, but it never includes `end`.
+ - `table.keys_by_prefix_dataset(prefix)`: Retrieves all row keys that start
+   with `prefix`, including the row key `prefix` itself if it is present in
+   the table.
+ - `table.sample_keys()`: Retrieves a sampling of keys from the underlying
+   table. This is often useful in conjunction with parallel scans.
+
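+The following is a minimal sketch of the three methods; `table` is assumed to
+be a `BigtableTable` obtained via `client.table(...)` as shown earlier, and the
+row keys are illustrative placeholders:
+
+```python
+keys_in_range = table.keys_by_range_dataset('train-00000', 'train-10000')
+keys_with_prefix = table.keys_by_prefix_dataset('train-')
+sampled_keys = table.sample_keys()
+```
+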
+### Reading cell values given a row key
+
+Given a dataset producing row keys, you can use the `table.lookup_columns`
+transformation to retrieve values. Example:
+
+```python
+key_dataset = tf.data.Dataset.from_tensor_slices([
+ 'row_key_1',
+ 'other_row_key',
+ 'final_row_key',
+])
+values_dataset = key_dataset.apply(
+ table.lookup_columns(('my_column_family', 'column_name'),
+ ('other_cf', 'col')))
+training_data = values_dataset.map(my_parsing_function) # ...
+```
+
+### Scanning ranges
+
+Given a contiguous range of rows, retrieve both the row keys and the data
+associated with a fixed set of columns. Scanning is the most efficient way to
+retrieve data from Cloud Bigtable and is thus a very common API for
+high-performance data pipelines. To construct a scanning `tf.data.Dataset` from
+a `BigtableTable` object, call one of the following methods:
+
+ - `table.scan_prefix(prefix, ...)`
+ - `table.scan_range(start, end, ...)`
+ - `table.parallel_scan_prefix(prefix, ...)`
+ - `table.parallel_scan_range(start, end, ...)`
+
+Aside from the specification of the contiguous range of rows, these methods all
+take the following arguments:
+
+ - `probability`: (Optional.) A float between 0 (exclusive) and 1 (inclusive).
+   A value less than 1 causes rows to be sampled probabilistically with the
+   provided probability.
+ - `columns`: The columns to read, as tuples. (See below.)
+ - `**kwargs`: The columns to read, as keyword arguments. (See below.)
+
+In addition, the two parallel operations accept the optional argument
+`num_parallel_scans`, which configures the number of parallel Cloud Bigtable
+scan operations to run. A reasonable default is chosen automatically for small
+Cloud Bigtable clusters. If you have a large cluster, or an extremely demanding
+workload, you can tune this value to optimize performance, as in the sketch
+below.
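+
+For example, here is a hedged sketch of a parallel prefix scan; the prefix,
+columns, and scan count are illustrative placeholders:
+
+```python
+columns = [('cf1', 'col1'), ('cf1', 'col2')]  # Illustrative column tuples.
+ds = table.parallel_scan_prefix(
+ 'train-', num_parallel_scans=4, columns=columns)
+```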
+
+#### Specifying columns to read when scanning
+
+All of the scan operations allow you to specify the column families and columns
+to read in the same two ways.
+
+##### Using `columns`
+
+The first way to specify the data to read is via the `columns` parameter. The
+value should be a tuple (or list of tuples) of strings. The first string in the
+tuple is the column family, and the second string in the tuple is the column
+qualifier.
+
+##### Using `**kwargs`
+
+The second way to specify the data to read is via the `**kwargs` parameter,
+which you can use to pass keyword arguments corresponding to the columns that
+you want to read. The keyword is the column family name, and the argument value
+should be either a string or a tuple of strings specifying the column
+qualifiers (column names).
+
+Although using `**kwargs` has the advantage of requiring less typing, it is not
+future-proof in all cases. (If we add a new parameter to the scan functions that
+has the same name as your column family, your code will break.)
+
+##### Examples
+
+Below are two equivalent snippets showing how to specify which columns to read:
+
+```python
+ds1 = table.scan_range("row_start", "row_end", columns=[("cfa", "c1"),
+ ("cfa", "c2"),
+ ("cfb", "c3")])
+ds2 = table.scan_range("row_start", "row_end", cfa=["c1", "c2"], cfb="c3")
+```
+
+In this example, we are reading three columns from a total of two column
+families. From the `cfa` column family, we read columns `c1` and `c2`. From the
+second column family (`cfb`), we read `c3`. Both `ds1` and `ds2` will output
+elements of type (`tf.string`, `tf.string`, `tf.string`, `tf.string`). The
+first `tf.string` is the row key, the second is the latest data in cell
+`cfa:c1`, the third corresponds to `cfa:c2`, and the final one is `cfb:c3`.
+
+#### Determinism when scanning
+
+While the non-parallel scan operations are fully deterministic, the parallel
+scan operations are not. If you would like to scan in parallel without losing
+determinism, you can build up the `parallel_interleave` yourself. As an
+example, say we want to scan all rows between `training_data_00000` and
+`training_data_90000`; we can use the following code snippet:
+
+```python
+table = # ...
+columns = [('cf1', 'col1'), ('cf1', 'col2')]
+NUM_PARALLEL_READS = # ...
+ds = tf.data.Dataset.range(9).shuffle(10)
+def interleave_fn(index):
+ # Given a starting index, compute the start and end row-key strings.
+ start_idx = index
+ end_idx = index + 1
+ start_idx_str = tf.as_string(start_idx * 10000, width=5, fill='0')
+ end_idx_str = tf.as_string(end_idx * 10000, width=5, fill='0')
+ start = tf.string_join(['training_data_', start_idx_str])
+ end = tf.string_join(['training_data_', end_idx_str])
+ return table.scan_range(start, end, columns=columns)
+ds = ds.apply(tf.contrib.data.parallel_interleave(
+ interleave_fn, cycle_length=NUM_PARALLEL_READS, prefetch_input_elements=1))
+```
+
+> Note: you should divide up the key range into more sub-ranges for increased
+> parallelism.
+
+## Writing to Cloud Bigtable
+
+To simplify getting started, this package provides basic support for
+writing data into Cloud Bigtable.
+
+> Note: The implementation is not optimized for performance! Please consider
+> using alternative frameworks such as Apache Beam / Cloud Dataflow for
+> production workloads.
+
+Below is an example of how to write a trivial dataset into Cloud Bigtable.
+
+```python
+import tensorflow as tf
+
+GCP_PROJECT_ID = '<FILL_ME_IN>'
+BIGTABLE_INSTANCE_ID = '<FILL_ME_IN>'
+BIGTABLE_TABLE_NAME = '<FILL_ME_IN>'
+COLUMN_FAMILY = '<FILL_ME_IN>'
+COLUMN_QUALIFIER = '<FILL_ME_IN>'
+
+def make_dataset():
+ """Makes a dataset to write to Cloud Bigtable."""
+ return tf.data.Dataset.from_tensor_slices([
+ 'training_data_1',
+ 'training_data_2',
+ 'training_data_3',
+ ])
+
+def make_row_key_dataset():
+ """Makes a dataset of strings used for row keys.
+
+ The strings are of the form: `fake-data-` followed by a sequential counter.
+ For example, this dataset would contain the following elements:
+
+ - fake-data-00000001
+ - fake-data-00000002
+ - ...
+ - fake-data-23498103
+ """
+ counter_dataset = tf.contrib.data.Counter()
+ width = 8
+ row_key_prefix = 'fake-data-'
+ ds = counter_dataset.map(lambda index: tf.as_string(index,
+ width=width,
+ fill='0'))
+ ds = ds.map(lambda idx_str: tf.string_join([row_key_prefix, idx_str]))
+ return ds
+
+
+def main():
+ client = tf.contrib.cloud.BigtableClient(GCP_PROJECT_ID, BIGTABLE_INSTANCE_ID)
+ table = client.table(BIGTABLE_TABLE_NAME)
+ dataset = make_dataset()
+ index_dataset = make_row_key_dataset()
+ aggregate_dataset = tf.data.Dataset.zip((index_dataset, dataset))
+ write_op = table.write(aggregate_dataset, column_families=[COLUMN_FAMILY],
+ columns=[COLUMN_QUALIFIER])
+
+ with tf.Session() as sess:
+ print('Starting transfer.')
+ sess.run(write_op)
+ print('Transfer complete.')
+
+if __name__ == '__main__':
+ main()
+```
+
+## Sample applications and architectures
+
+While most machine learning applications are well served by a high-performance
+distributed file system, there are certain applications where using Cloud
+Bigtable works extremely well.
+
+### Perfect Shuffling
+
+Normally, training data is stored in flat files, and a combination of (1)
+`tf.data.Dataset.interleave` (or `parallel_interleave`), (2)
+`tf.data.Dataset.shuffle`, and (3) writing the data in an unsorted order in the
+first place provides enough randomization for models to train efficiently.
+However, if you would like a perfect shuffle, you can use Cloud Bigtable's
+low-latency random access capabilities: create a `tf.data.Dataset` that
+generates the keys in a perfectly random order (or read all the keys into
+memory and use a `tf.data.Dataset.shuffle` buffer sized to fit all of them),
+and then use `lookup_columns` to retrieve the training data, as in the sketch
+below.
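+
+A minimal sketch of this pattern, assuming the full key set fits in the shuffle
+buffer; `table`, the key prefix, the column specification, and `NUM_KEYS` are
+illustrative placeholders:
+
+```python
+NUM_KEYS = 1000000  # Assumed upper bound on the number of row keys.
+keys = table.keys_by_prefix_dataset('train-')
+shuffled_keys = keys.shuffle(buffer_size=NUM_KEYS)
+training_data = shuffled_keys.apply(
+ table.lookup_columns(('my_column_family', 'column_name')))
+```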
+
+### Distributed Reinforcement Learning
+
+Sophisticated reinforcement learning algorithms are commonly trained across a
+distributed cluster. (See [IMPALA by DeepMind][impala].) One part of the cluster
+runs self-play, while the other part of the cluster learns a new version of the
+model based on the training data generated by self-play. The new model version
+is then distributed to the self-play half of the cluster, and new training data
+is generated to continue the cycle.
+
+In such a configuration, because there is value in training on the freshest
+examples, a storage service like Cloud Bigtable can be used to store and
+serve the generated training data. When using Cloud Bigtable, there is no need
+to aggregate the examples into large batch files; instead, the examples can be
+written as soon as they are generated and then retrieved at high speed.
+
+[impala]: https://arxiv.org/abs/1802.01561
+
+## Common Gotchas!
+
+### gRPC Certificates
+
+If you encounter a log line that includes the following:
+
+```
+"description":"Failed to load file", [...],
+"filename":"/usr/share/grpc/roots.pem"
+```
+
+you likely need to copy the [gRPC roots.pem file][grpcPem] to
+`/usr/share/grpc/roots.pem` on your local machine.
+
+[grpcPem]: https://github.com/grpc/grpc/blob/master/etc/roots.pem
+
+### Permission denied errors
+
+The TensorFlow Cloud Bigtable client will search for credentials to use in the
+process's environment. It will use the first credentials it finds if multiple
+are available.
+
+ - **Compute Engine**: When running on Compute Engine, the client will often use
+ the service account from the virtual machine's metadata service. Be sure to
+ authorize your Compute Engine VM to have access to the Cloud Bigtable service
+ when creating your VM.
+ - **Cloud TPU**: Your Cloud TPUs run with the designated Cloud TPU service
+ account dedicated to your GCP project. Ensure the service account has been
+ authorized via the Cloud Console to access your Cloud Bigtable instances.
diff --git a/tensorflow/contrib/bigtable/__init__.py b/tensorflow/contrib/bigtable/__init__.py
index 7df054637c..b7d89c9842 100644
--- a/tensorflow/contrib/bigtable/__init__.py
+++ b/tensorflow/contrib/bigtable/__init__.py
@@ -18,7 +18,7 @@ This contrib package allows TensorFlow to interface directly with Cloud Bigtable
for high-speed data loading.
@@BigtableClient
-@@BigTable
+@@BigtableTable
"""
@@ -26,14 +26,14 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigTable
from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigtableClient
+from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigtableTable
from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = [
- 'BigTable',
'BigtableClient',
+ 'BigtableTable',
]
remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
index 70923e6287..a6755a3496 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
@@ -276,7 +276,7 @@ class ToBigtableOp : public AsyncOpKernel {
}
OP_REQUIRES_ASYNC(
ctx, failures.empty() && mutation_status.ok(),
- errors::Unknown("Failure while writing to BigTable: ",
+ errors::Unknown("Failure while writing to Cloud Bigtable: ",
mutation_status.error_code(), " - ",
mutation_status.error_message(), " (",
mutation_status.error_details(),
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
index 2514575f30..67bf14c176 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
@@ -27,10 +27,10 @@ Status GrpcStatusToTfStatus(const ::grpc::Status& status) {
status.error_code() == ::grpc::StatusCode::OUT_OF_RANGE) {
grpc_code = ::grpc::StatusCode::INTERNAL;
}
- return Status(
- static_cast<::tensorflow::error::Code>(status.error_code()),
- strings::StrCat("Error reading from BigTable: ", status.error_message(),
- " (Details: ", status.error_details(), ")"));
+ return Status(static_cast<::tensorflow::error::Code>(status.error_code()),
+ strings::StrCat("Error reading from Cloud Bigtable: ",
+ status.error_message(),
+ " (Details: ", status.error_details(), ")"));
}
string RegexFromStringSet(const std::vector<string>& strs) {
diff --git a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
index 2f20064619..e36f7f32c6 100644
--- a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
+++ b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
@@ -44,7 +44,7 @@ class BigtableOpsTest(test.TestCase):
def setUp(self):
self._client = gen_bigtable_test_ops.bigtable_test_client()
table = gen_bigtable_ops.bigtable_table(self._client, "testtable")
- self._table = bigtable.BigTable("testtable", None, table)
+ self._table = bigtable.BigtableTable("testtable", None, table)
def _makeSimpleDataset(self):
output_rows = dataset_ops.Dataset.from_tensor_slices(self.COMMON_ROW_KEYS)
diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
index 9f73b7223c..fd30aa8bbb 100644
--- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
+++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
@@ -94,7 +94,7 @@ class BigtableClient(object):
project_id, instance_id, connection_pool_size, max_receive_message_size)
def table(self, name, snapshot=None):
- """Opens a table and returns a `BigTable` object.
+ """Opens a table and returns a `BigtableTable` object.
Args:
name: A `tf.string` `tf.Tensor` name of the table to open.
@@ -102,19 +102,20 @@ class BigtableClient(object):
request the creation of a snapshot. (Note: currently unimplemented.)
Returns:
- A `BigTable` python object representing the operations available on the
- table.
+ A `BigtableTable` python object representing the operations available on
+ the table.
"""
# TODO(saeta): Implement snapshot functionality.
table = gen_bigtable_ops.bigtable_table(self._resource, name)
- return BigTable(name, snapshot, table)
+ return BigtableTable(name, snapshot, table)
-class BigTable(object):
- """BigTable is the entrypoint for reading and writing data in Cloud Bigtable.
+class BigtableTable(object):
+ """BigtableTable is the entrypoint for reading and writing data in Cloud
+ Bigtable.
- This BigTable class is the python representation of the Cloud Bigtable table
- within TensorFlow. Methods on this class allow data to be read from and
+ This BigtableTable class is the Python representation of the Cloud Bigtable
+ table within TensorFlow. Methods on this class allow data to be read from and
written to the Cloud Bigtable service in flexible and high performance
manners.
"""
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index ef0e80cd09..f4a375328e 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -147,6 +147,7 @@ py_library(
deps = [
":distillation_loss",
":estimator_utils",
+ ":model",
":trainer_hooks",
"//tensorflow/contrib/boosted_trees:gbdt_batch",
"//tensorflow/contrib/boosted_trees:model_ops_py",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index 7eb429b636..dbfa69edcb 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -26,6 +26,7 @@ from __future__ import print_function
import six
from tensorflow.contrib import layers
+from tensorflow.contrib.boosted_trees.estimator_batch import model
from tensorflow.contrib.boosted_trees.estimator_batch import distillation_loss
from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
@@ -34,6 +35,7 @@ from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batc
from tensorflow.contrib.layers.python.layers import optimizers
from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.python.estimator import estimator as core_estimator
from tensorflow.contrib.learn.python.learn.estimators import model_fn
from tensorflow.python.feature_column import feature_column as feature_column_lib
from tensorflow.python.framework import ops
@@ -62,27 +64,29 @@ def _add_hidden_layer_summary(value, tag):
summary.histogram("%s_activation" % tag, value)
-def _dnn_tree_combined_model_fn(features,
- labels,
- mode,
- head,
- dnn_hidden_units,
- dnn_feature_columns,
- tree_learner_config,
- num_trees,
- tree_examples_per_layer,
- config=None,
- dnn_optimizer="Adagrad",
- dnn_activation_fn=nn.relu,
- dnn_dropout=None,
- dnn_input_layer_partitioner=None,
- dnn_input_layer_to_tree=True,
- dnn_steps_to_train=10000,
- predict_with_tree_only=False,
- tree_feature_columns=None,
- tree_center_bias=False,
- dnn_to_tree_distillation_param=None,
- use_core_versions=False):
+def _dnn_tree_combined_model_fn(
+ features,
+ labels,
+ mode,
+ head,
+ dnn_hidden_units,
+ dnn_feature_columns,
+ tree_learner_config,
+ num_trees,
+ tree_examples_per_layer,
+ config=None,
+ dnn_optimizer="Adagrad",
+ dnn_activation_fn=nn.relu,
+ dnn_dropout=None,
+ dnn_input_layer_partitioner=None,
+ dnn_input_layer_to_tree=True,
+ dnn_steps_to_train=10000,
+ predict_with_tree_only=False,
+ tree_feature_columns=None,
+ tree_center_bias=False,
+ dnn_to_tree_distillation_param=None,
+ use_core_versions=False,
+ output_type=model.ModelBuilderOutputType.MODEL_FN_OPS):
"""DNN and GBDT combined model_fn.
Args:
@@ -156,6 +160,10 @@ def _dnn_tree_combined_model_fn(features,
partitioned_variables.min_max_variable_partitioner(
max_partitions=config.num_ps_replicas, min_slice_size=64 << 20))
+ if (output_type == model.ModelBuilderOutputType.ESTIMATOR_SPEC and
+ not use_core_versions):
+ raise ValueError("You must use core versions with Estimator Spec")
+
with variable_scope.variable_scope(
dnn_parent_scope,
values=tuple(six.itervalues(features)),
@@ -235,7 +243,8 @@ def _dnn_tree_combined_model_fn(features,
learner_config=tree_learner_config,
feature_columns=tree_feature_columns,
logits_dimension=head.logits_dimension,
- features=tree_features)
+ features=tree_features,
+ use_core_columns=use_core_versions)
with ops.name_scope("gbdt"):
predictions_dict = gbdt_model.predict(mode)
@@ -284,63 +293,96 @@ def _dnn_tree_combined_model_fn(features,
del loss
return control_flow_ops.no_op()
- if use_core_versions:
- model_fn_ops = head.create_estimator_spec(
- features=features,
- mode=mode,
- labels=labels,
- train_op_fn=_no_train_op_fn,
- logits=tree_train_logits)
- dnn_train_op = head.create_estimator_spec(
- features=features,
- mode=mode,
- labels=labels,
- train_op_fn=_dnn_train_op_fn,
- logits=dnn_logits)
- dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
- dnn_train_op).train_op
+ if tree_center_bias:
+ num_trees += 1
+ finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
- tree_train_op = head.create_estimator_spec(
- features=tree_features,
- mode=mode,
- labels=labels,
- train_op_fn=_tree_train_op_fn,
- logits=tree_train_logits)
- tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
- tree_train_op).train_op
+ if output_type == model.ModelBuilderOutputType.MODEL_FN_OPS:
+ if use_core_versions:
+ model_fn_ops = head.create_estimator_spec(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_no_train_op_fn,
+ logits=tree_train_logits)
+ dnn_train_op = head.create_estimator_spec(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_dnn_train_op_fn,
+ logits=dnn_logits)
+ dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+ dnn_train_op).train_op
- model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
- else:
- model_fn_ops = head.create_model_fn_ops(
+ tree_train_op = head.create_estimator_spec(
+ features=tree_features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_tree_train_op_fn,
+ logits=tree_train_logits)
+ tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+ tree_train_op).train_op
+
+ model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(
+ model_fn_ops)
+ else:
+ model_fn_ops = head.create_model_fn_ops(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_no_train_op_fn,
+ logits=tree_train_logits)
+ dnn_train_op = head.create_model_fn_ops(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_dnn_train_op_fn,
+ logits=dnn_logits).train_op
+ tree_train_op = head.create_model_fn_ops(
+ features=tree_features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_tree_train_op_fn,
+ logits=tree_train_logits).train_op
+
+ # Add the hooks
+ model_fn_ops.training_hooks.extend([
+ trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train,
+ tree_train_op),
+ trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
+ finalized_trees)
+ ])
+ return model_fn_ops
+
+ elif output_type == model.ModelBuilderOutputType.ESTIMATOR_SPEC:
+ fusion_spec = head.create_estimator_spec(
features=features,
mode=mode,
labels=labels,
train_op_fn=_no_train_op_fn,
logits=tree_train_logits)
- dnn_train_op = head.create_model_fn_ops(
+ dnn_spec = head.create_estimator_spec(
features=features,
mode=mode,
labels=labels,
train_op_fn=_dnn_train_op_fn,
- logits=dnn_logits).train_op
- tree_train_op = head.create_model_fn_ops(
+ logits=dnn_logits)
+ tree_spec = head.create_estimator_spec(
features=tree_features,
mode=mode,
labels=labels,
train_op_fn=_tree_train_op_fn,
- logits=tree_train_logits).train_op
-
- if tree_center_bias:
- num_trees += 1
- finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
-
- model_fn_ops.training_hooks.extend([
- trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train,
- tree_train_op),
- trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, finalized_trees)
- ])
+ logits=tree_train_logits)
- return model_fn_ops
+ training_hooks = [
+ trainer_hooks.SwitchTrainOp(dnn_spec.train_op, dnn_steps_to_train,
+ tree_spec.train_op),
+ trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
+ finalized_trees)
+ ]
+ fusion_spec = fusion_spec._replace(training_hooks=training_hooks +
+ list(fusion_spec.training_hooks))
+ return fusion_spec
class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
@@ -697,3 +739,100 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
model_dir=model_dir,
config=config,
feature_engineering_fn=feature_engineering_fn)
+
+
+class CoreDNNBoostedTreeCombinedEstimator(core_estimator.Estimator):
+ """Initializes a core version of DNNBoostedTreeCombinedEstimator.
+
+ Args:
+ dnn_hidden_units: List of hidden units per layer for DNN.
+ dnn_feature_columns: An iterable containing all the feature columns
+ used by the model's DNN.
+ tree_learner_config: A config for the tree learner.
+ num_trees: Number of trees to grow model to after training DNN.
+ tree_examples_per_layer: Number of examples to accumulate before
+ growing the tree a layer. This value has a big impact on model
+ quality and should be set equal to the number of examples in
+ training dataset if possible. It can also be a function that computes
+ the number of examples based on the depth of the layer that's
+ being built.
+ head: `Head` instance.
+ model_dir: Directory for model exports.
+ config: `RunConfig` of the estimator.
+ dnn_optimizer: string, `Optimizer` object, or callable that defines the
+ optimizer to use for training the DNN. If `None`, will use the Adagrad
+ optimizer with default learning rate.
+ dnn_activation_fn: Activation function applied to each layer of the DNN.
+ If `None`, will use `tf.nn.relu`.
+ dnn_dropout: When not `None`, the probability to drop out a given
+ unit in the DNN.
+ dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+ Defaults to `min_max_variable_partitioner` with `min_slice_size`
+ 64 << 20.
+ dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+ as a feature to the tree.
+ dnn_steps_to_train: Number of steps to train dnn for before switching
+ to gbdt.
+ predict_with_tree_only: Whether to use only the tree model output as the
+ final prediction.
+ tree_feature_columns: An iterable containing all the feature columns
+ used by the model's boosted trees. If dnn_input_layer_to_tree is
+ set to True, these features are in addition to dnn_feature_columns.
+ tree_center_bias: Whether a separate tree should be created for
+ first fitting the bias.
+ dnn_to_tree_distillation_param: A Tuple of (float, loss_fn), where the
+ float defines the weight of the distillation loss, and the loss_fn, for
+ computing distillation loss, takes dnn_logits, tree_logits and weight
+ tensor. If the entire tuple is None, no distillation will be applied. If
+ only the loss_fn is None, we will take the sigmoid/softmax cross entropy
+ loss by default. When distillation is applied, `predict_with_tree_only`
+ will be set to True.
+ """
+
+ def __init__(self,
+ dnn_hidden_units,
+ dnn_feature_columns,
+ tree_learner_config,
+ num_trees,
+ tree_examples_per_layer,
+ head,
+ model_dir=None,
+ config=None,
+ dnn_optimizer="Adagrad",
+ dnn_activation_fn=nn.relu,
+ dnn_dropout=None,
+ dnn_input_layer_partitioner=None,
+ dnn_input_layer_to_tree=True,
+ dnn_steps_to_train=10000,
+ predict_with_tree_only=False,
+ tree_feature_columns=None,
+ tree_center_bias=False,
+ dnn_to_tree_distillation_param=None):
+
+ def _model_fn(features, labels, mode, config):
+ return _dnn_tree_combined_model_fn(
+ features=features,
+ labels=labels,
+ mode=mode,
+ head=head,
+ dnn_hidden_units=dnn_hidden_units,
+ dnn_feature_columns=dnn_feature_columns,
+ tree_learner_config=tree_learner_config,
+ num_trees=num_trees,
+ tree_examples_per_layer=tree_examples_per_layer,
+ config=config,
+ dnn_optimizer=dnn_optimizer,
+ dnn_activation_fn=dnn_activation_fn,
+ dnn_dropout=dnn_dropout,
+ dnn_input_layer_partitioner=dnn_input_layer_partitioner,
+ dnn_input_layer_to_tree=dnn_input_layer_to_tree,
+ dnn_steps_to_train=dnn_steps_to_train,
+ predict_with_tree_only=predict_with_tree_only,
+ tree_feature_columns=tree_feature_columns,
+ tree_center_bias=tree_center_bias,
+ dnn_to_tree_distillation_param=dnn_to_tree_distillation_param,
+ output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC,
+ use_core_versions=True)
+
+ super(CoreDNNBoostedTreeCombinedEstimator, self).__init__(
+ model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
index 9b7acfa664..839eedd3a8 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -28,10 +28,11 @@ from tensorflow.python.estimator.canned import head as head_lib
from tensorflow.python.feature_column import feature_column_lib as core_feature_column
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops.losses import losses
from tensorflow.python.platform import googletest
-
+from tensorflow.python.training import checkpoint_utils
def _train_input_fn():
features = {
@@ -156,5 +157,72 @@ class DNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
classifier.evaluate(input_fn=_eval_input_fn, steps=1)
+class CoreDNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
+
+ def _assert_checkpoint(self, model_dir, global_step):
+ reader = checkpoint_utils.load_checkpoint(model_dir)
+ self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+
+ def testTrainEvaluateInferDoesNotThrowErrorWithNoDnnInput(self):
+ head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+ loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+ learner_config = learner_pb2.LearnerConfig()
+ learner_config.num_classes = 2
+ learner_config.constraints.max_tree_depth = 3
+ model_dir = tempfile.mkdtemp()
+ config = run_config.RunConfig()
+
+ est = estimator.CoreDNNBoostedTreeCombinedEstimator(
+ head=head_fn,
+ dnn_hidden_units=[1],
+ dnn_feature_columns=[core_feature_column.numeric_column("x")],
+ tree_learner_config=learner_config,
+ num_trees=1,
+ tree_examples_per_layer=3,
+ model_dir=model_dir,
+ config=config,
+ dnn_steps_to_train=10,
+ dnn_input_layer_to_tree=False,
+ tree_feature_columns=[core_feature_column.numeric_column("x")])
+
+ # Train for a few steps.
+ est.train(input_fn=_train_input_fn, steps=1000)
+ # 10 steps for dnn, 3 for 1 tree of depth 3 + 1 after the tree finished
+ self._assert_checkpoint(est.model_dir, global_step=14)
+ res = est.evaluate(input_fn=_eval_input_fn, steps=1)
+ self.assertLess(0.5, res["auc"])
+ est.predict(input_fn=_eval_input_fn)
+
+ def testTrainEvaluateInferDoesNotThrowErrorWithDnnInput(self):
+ head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+ loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+ learner_config = learner_pb2.LearnerConfig()
+ learner_config.num_classes = 2
+ learner_config.constraints.max_tree_depth = 3
+ model_dir = tempfile.mkdtemp()
+ config = run_config.RunConfig()
+
+ est = estimator.CoreDNNBoostedTreeCombinedEstimator(
+ head=head_fn,
+ dnn_hidden_units=[1],
+ dnn_feature_columns=[core_feature_column.numeric_column("x")],
+ tree_learner_config=learner_config,
+ num_trees=1,
+ tree_examples_per_layer=3,
+ model_dir=model_dir,
+ config=config,
+ dnn_steps_to_train=10,
+ dnn_input_layer_to_tree=True,
+ tree_feature_columns=[])
+
+ # Train for a few steps.
+ est.train(input_fn=_train_input_fn, steps=1000)
+ res = est.evaluate(input_fn=_eval_input_fn, steps=1)
+ self.assertLess(0.5, res["auc"])
+ est.predict(input_fn=_eval_input_fn)
+
+
if __name__ == "__main__":
googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 59a78515c6..2df879f924 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -22,9 +22,16 @@ from tensorflow.contrib.boosted_trees.estimator_batch import model
from tensorflow.contrib.boosted_trees.python.utils import losses
from tensorflow.contrib.learn.python.learn.estimators import estimator
from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.python.estimator import estimator as core_estimator
from tensorflow.python.ops import math_ops
+# ================== Old estimator interface ==================================
+# The estimators below were designed for old feature columns and old estimator
+# interface. They can be used with new feature columns and losses by setting
+# use_core_libs = True.
+
+
class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
"""An estimator using gradient boosted decision trees."""
@@ -354,3 +361,166 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
model_dir=model_dir,
config=config,
feature_engineering_fn=feature_engineering_fn)
+
+# ================== New Estimator interface ==================================
+# The estimators below use new core Estimator interface and must be used with
+# new feature columns and heads.
+
+
+class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator):
+ """An estimator using gradient boosted decision trees.
+
+ Useful for training with user specified `Head`.
+ """
+
+ def __init__(self,
+ learner_config,
+ examples_per_layer,
+ head,
+ num_trees=None,
+ feature_columns=None,
+ weight_column_name=None,
+ model_dir=None,
+ config=None,
+ label_keys=None,
+ feature_engineering_fn=None,
+ logits_modifier_function=None,
+ center_bias=True,
+ output_leaf_index=False):
+ """Initializes a core version of GradientBoostedDecisionTreeEstimator.
+
+ Args:
+ learner_config: A config for the learner.
+ examples_per_layer: Number of examples to accumulate before growing a
+ layer. It can also be a function that computes the number of examples
+ based on the depth of the layer that's being built.
+ head: `Head` instance.
+ num_trees: An int, number of trees to build.
+ feature_columns: A list of feature columns.
+ weight_column_name: Name of the column for weights, or None if not
+ weighted.
+ model_dir: Directory for model exports, etc.
+ config: `RunConfig` object to configure the runtime settings.
+ label_keys: Optional list of strings with size `[n_classes]` defining the
+ label vocabulary. Only supported for `n_classes` > 2.
+ feature_engineering_fn: Feature engineering function. Takes features and
+ labels which are the output of `input_fn` and returns features and
+ labels which will be fed into the model.
+ logits_modifier_function: A modifier function for the logits.
+ center_bias: Whether a separate tree should be created for first fitting
+ the bias.
+ output_leaf_index: whether to output leaf indices along with predictions
+ during inference. The leaf node indexes are available in predictions
+ dict by the key 'leaf_index'. For example,
+ result_dict = classifier.predict(...)
+ for example_prediction_result in result_dict:
+ # access leaf index list by example_prediction_result["leaf_index"]
+ # which contains one leaf index per tree
+ """
+
+ def _model_fn(features, labels, mode, config):
+ return model.model_builder(
+ features=features,
+ labels=labels,
+ mode=mode,
+ config=config,
+ params={
+ 'head': head,
+ 'feature_columns': feature_columns,
+ 'learner_config': learner_config,
+ 'num_trees': num_trees,
+ 'weight_column_name': weight_column_name,
+ 'examples_per_layer': examples_per_layer,
+ 'center_bias': center_bias,
+ 'logits_modifier_function': logits_modifier_function,
+ 'use_core_libs': True,
+ 'output_leaf_index': output_leaf_index,
+ },
+ output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC)
+
+ super(CoreGradientBoostedDecisionTreeEstimator, self).__init__(
+ model_fn=_model_fn, model_dir=model_dir, config=config)
+
+
+class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
+ """A ranking estimator using gradient boosted decision trees."""
+
+ def __init__(
+ self,
+ learner_config,
+ examples_per_layer,
+ head,
+ ranking_model_pair_keys,
+ num_trees=None,
+ feature_columns=None,
+ weight_column_name=None,
+ model_dir=None,
+ config=None,
+ label_keys=None,
+ logits_modifier_function=None,
+ center_bias=False,
+ output_leaf_index=False,
+ ):
+ """Initializes a GradientBoostedDecisionTreeRanker instance.
+
+ This is an estimator that can be trained on pairwise data and used for
+ inference on non-paired data. This is essentially LambdaMART.
+
+ Args:
+ learner_config: A config for the learner.
+ examples_per_layer: Number of examples to accumulate before growing a
+ layer. It can also be a function that computes the number of examples
+ based on the depth of the layer that's being built.
+ head: `Head` instance.
+ ranking_model_pair_keys: Keys to distinguish between features
+ for left and right part of the training pairs for ranking. For example,
+ for an Example with features "a.f1" and "b.f1", the keys would be
+ ("a", "b").
+ num_trees: An int, number of trees to build.
+ feature_columns: A list of feature columns.
+ weight_column_name: Name of the column for weights, or None if not
+ weighted.
+ model_dir: Directory for model exports, etc.
+ config: `RunConfig` object to configure the runtime settings.
+ label_keys: Optional list of strings with size `[n_classes]` defining the
+ label vocabulary. Only supported for `n_classes` > 2.
+ logits_modifier_function: A modifier function for the logits.
+ center_bias: Whether a separate tree should be created for first fitting
+ the bias.
+ output_leaf_index: whether to output leaf indices along with predictions
+ during inference. The leaf node indexes are available in predictions
+ dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
+ [batch_size, num_trees].
+ For example,
+ result_iter = classifier.predict(...)
+ for result_dict in result_iter:
+ # access leaf index list by result_dict["leaf_index"]
+ # which contains one leaf index per tree
+
+ Raises:
+ ValueError: If learner_config is not valid.
+ """
+
+ def _model_fn(features, labels, mode, config):
+ return model.ranking_model_builder(
+ features=features,
+ labels=labels,
+ mode=mode,
+ config=config,
+ params={
+ 'head': head,
+ 'n_classes': 2,
+ 'feature_columns': feature_columns,
+ 'learner_config': learner_config,
+ 'num_trees': num_trees,
+ 'weight_column_name': weight_column_name,
+ 'examples_per_layer': examples_per_layer,
+ 'center_bias': center_bias,
+ 'logits_modifier_function': logits_modifier_function,
+ 'use_core_libs': True,
+ 'output_leaf_index': output_leaf_index,
+ 'ranking_model_pair_keys': ranking_model_pair_keys,
+ },
+ output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC)
+
+ super(CoreGradientBoostedDecisionTreeRanker, self).__init__(
+ model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index 2c2dcb039d..9e9febbbef 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -182,7 +182,7 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
config = run_config.RunConfig()
head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
- loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+ loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
model = estimator.GradientBoostedDecisionTreeRanker(
head=head_fn,
@@ -203,5 +203,60 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
model.predict(input_fn=_infer_ranking_train_input_fn)
+class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
+
+ def testTrainEvaluateInferDoesNotThrowError(self):
+ head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+ loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+ learner_config = learner_pb2.LearnerConfig()
+ learner_config.num_classes = 2
+ learner_config.constraints.max_tree_depth = 1
+ model_dir = tempfile.mkdtemp()
+ config = run_config.RunConfig()
+
+ est = estimator.CoreGradientBoostedDecisionTreeEstimator(
+ head=head_fn,
+ learner_config=learner_config,
+ num_trees=1,
+ examples_per_layer=3,
+ model_dir=model_dir,
+ config=config,
+ feature_columns=[core_feature_column.numeric_column("x")])
+
+ # Train for a few steps.
+ est.train(input_fn=_train_input_fn, steps=1000)
+ est.evaluate(input_fn=_eval_input_fn, steps=1)
+ est.predict(input_fn=_eval_input_fn)
+
+ def testRankingDontThrowExceptionForEstimator(self):
+ learner_config = learner_pb2.LearnerConfig()
+ learner_config.num_classes = 2
+ learner_config.constraints.max_tree_depth = 1
+ model_dir = tempfile.mkdtemp()
+ config = run_config.RunConfig()
+
+ head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+ loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+ est = estimator.CoreGradientBoostedDecisionTreeRanker(
+ head=head_fn,
+ learner_config=learner_config,
+ num_trees=1,
+ examples_per_layer=3,
+ model_dir=model_dir,
+ config=config,
+ feature_columns=[
+ core_feature_column.numeric_column("f1"),
+ core_feature_column.numeric_column("f2")
+ ],
+ ranking_model_pair_keys=("a", "b"))
+
+ # Train for a few steps.
+ est.train(input_fn=_ranking_train_input_fn, steps=1000)
+ est.evaluate(input_fn=_ranking_train_input_fn, steps=1)
+ est.predict(input_fn=_infer_ranking_train_input_fn)
+
+
if __name__ == "__main__":
googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
index 0e8a56e6e9..161cc42cb0 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -29,7 +29,17 @@ from tensorflow.python.framework import ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import training_util
-def model_builder(features, labels, mode, params, config):
+class ModelBuilderOutputType(object):
+ MODEL_FN_OPS = 0
+ ESTIMATOR_SPEC = 1
+
+
+def model_builder(features,
+ labels,
+ mode,
+ params,
+ config,
+ output_type=ModelBuilderOutputType.MODEL_FN_OPS):
"""Multi-machine batch gradient descent tree model.
Args:
@@ -49,6 +59,8 @@ def model_builder(features, labels, mode, params, config):
* center_bias: Whether a separate tree should be created for first fitting
the bias.
config: `RunConfig` of the estimator.
+ output_type: Whether to return ModelFnOps (old interface) or EstimatorSpec
+ (new interface).
Returns:
A `ModelFnOps` object.
@@ -115,35 +127,63 @@ def model_builder(features, labels, mode, params, config):
return update_op
create_estimator_spec_op = getattr(head, "create_estimator_spec", None)
- if use_core_libs and callable(create_estimator_spec_op):
- model_fn_ops = head.create_estimator_spec(
- features=features,
- mode=mode,
- labels=labels,
- train_op_fn=_train_op_fn,
- logits=logits)
- model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
- else:
- model_fn_ops = head.create_model_fn_ops(
- features=features,
- mode=mode,
- labels=labels,
- train_op_fn=_train_op_fn,
- logits=logits)
- if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
- model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
- gbdt_batch.LEAF_INDEX]
+
+ training_hooks = []
if num_trees:
if center_bias:
num_trees += 1
+
finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
- model_fn_ops.training_hooks.append(
+ training_hooks.append(
trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
finalized_trees))
+
+ if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+ if use_core_libs and callable(create_estimator_spec_op):
+ model_fn_ops = head.create_estimator_spec(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_train_op_fn,
+ logits=logits)
+ model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(
+ model_fn_ops)
+ else:
+ model_fn_ops = head.create_model_fn_ops(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_train_op_fn,
+ logits=logits)
+
+ if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
+ model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
+ gbdt_batch.LEAF_INDEX]
+
+ model_fn_ops.training_hooks.extend(training_hooks)
+ return model_fn_ops
+ elif output_type == ModelBuilderOutputType.ESTIMATOR_SPEC:
+ assert callable(create_estimator_spec_op)
+ estimator_spec = head.create_estimator_spec(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_train_op_fn,
+ logits=logits)
+
+ estimator_spec = estimator_spec._replace(
+ training_hooks=training_hooks + list(estimator_spec.training_hooks))
+ return estimator_spec
+
return model_fn_ops
-def ranking_model_builder(features, labels, mode, params, config):
+def ranking_model_builder(features,
+ labels,
+ mode,
+ params,
+ config,
+ output_type=ModelBuilderOutputType.MODEL_FN_OPS):
"""Multi-machine batch gradient descent tree model for ranking.
Args:
@@ -167,6 +207,9 @@ def ranking_model_builder(features, labels, mode, params, config):
for an Example with features "a.f1" and "b.f1", the keys would be
("a", "b").
config: `RunConfig` of the estimator.
+ output_type: Whether to return ModelFnOps (old interface) or EstimatorSpec
+ (new interface).
+
Returns:
A `ModelFnOps` object.
@@ -294,31 +337,54 @@ def ranking_model_builder(features, labels, mode, params, config):
return update_op
create_estimator_spec_op = getattr(head, "create_estimator_spec", None)
- if use_core_libs and callable(create_estimator_spec_op):
- model_fn_ops = head.create_estimator_spec(
- features=features,
- mode=mode,
- labels=labels,
- train_op_fn=_train_op_fn,
- logits=logits)
- model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
- else:
- model_fn_ops = head.create_model_fn_ops(
- features=features,
- mode=mode,
- labels=labels,
- train_op_fn=_train_op_fn,
- logits=logits)
- if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
- model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
- gbdt_batch.LEAF_INDEX]
+ training_hooks = []
if num_trees:
if center_bias:
num_trees += 1
+
finalized_trees, attempted_trees = (
gbdt_model_main.get_number_of_trees_tensor())
- model_fn_ops.training_hooks.append(
+ training_hooks.append(
trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
finalized_trees))
+
+ if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+ if use_core_libs and callable(create_estimator_spec_op):
+ model_fn_ops = head.create_estimator_spec(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_train_op_fn,
+ logits=logits)
+ model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(
+ model_fn_ops)
+ else:
+ model_fn_ops = head.create_model_fn_ops(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_train_op_fn,
+ logits=logits)
+
+ if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
+ model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
+ gbdt_batch.LEAF_INDEX]
+
+ model_fn_ops.training_hooks.extend(training_hooks)
+ return model_fn_ops
+
+ elif output_type == ModelBuilderOutputType.ESTIMATOR_SPEC:
+ assert callable(create_estimator_spec_op)
+ estimator_spec = head.create_estimator_spec(
+ features=features,
+ mode=mode,
+ labels=labels,
+ train_op_fn=_train_op_fn,
+ logits=logits)
+
+ estimator_spec = estimator_spec._replace(
+ training_hooks=training_hooks + list(estimator_spec.training_hooks))
+ return estimator_spec
+
return model_fn_ops
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 0b28f81e7c..5b4be2f258 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -241,6 +241,11 @@ class CreateQuantileAccumulatorOp : public OpKernel {
// other exceptions. If one already exists, it unrefs the new one.
const Tensor* stamp_token_t;
OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+ // An epsilon value of zero could cause performance issues and is therefore
+ // disallowed.
+ OP_REQUIRES(
+ context, epsilon_ > 0,
+ errors::InvalidArgument("An epsilon value of zero is not allowed."));
auto result = new QuantileStreamResource(epsilon_, num_quantiles_,
max_elements_, generate_quantiles_,
stamp_token_t->scalar<int64>()());
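
The user-visible effect, sketched below with caveats: QuantileAccumulator is assumed to be the contrib Python wrapper over this kernel, and its exact constructor arguments may differ.

# A hedged sketch: creating an accumulator with epsilon == 0 now fails fast
# with InvalidArgument instead of silently degrading performance.
accumulator = quantile_ops.QuantileAccumulator(
    init_stamp_token=0, epsilon=0.0, num_quantiles=10, max_elements=1000)
# Running the accumulator's create op now raises:
#   InvalidArgumentError: An epsilon value of zero is not allowed.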
diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
index 1bfeed3066..6d9a6ee5a0 100644
--- a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
@@ -372,12 +372,18 @@ class GrowTreeEnsembleOp : public OpKernel {
return;
}
+ // Get the max tree depth.
+ const Tensor* max_tree_depth_t;
+ OP_REQUIRES_OK(context,
+ context->input("max_tree_depth", &max_tree_depth_t));
+ const int32 max_tree_depth = max_tree_depth_t->scalar<int32>()();
+
// Update and retrieve the growable tree.
// If the tree is fully built and dropout was applied, it also adjusts the
// weights of the dropped trees and of the last tree.
boosted_trees::trees::DecisionTreeConfig* const tree_config =
UpdateAndRetrieveGrowableTree(ensemble_resource, learning_rate,
- dropout_seed);
+ dropout_seed, max_tree_depth);
// Split tree nodes.
for (auto& split_entry : best_splits) {
@@ -494,7 +500,8 @@ class GrowTreeEnsembleOp : public OpKernel {
boosted_trees::trees::DecisionTreeConfig* UpdateAndRetrieveGrowableTree(
boosted_trees::models::DecisionTreeEnsembleResource* const
ensemble_resource,
- const float learning_rate, const uint64 dropout_seed) {
+ const float learning_rate, const uint64 dropout_seed,
+ const int32 max_tree_depth) {
const auto num_trees = ensemble_resource->num_trees();
if (num_trees <= 0 ||
ensemble_resource->LastTreeMetadata()->is_finalized()) {
@@ -506,8 +513,7 @@ class GrowTreeEnsembleOp : public OpKernel {
tree_config->add_nodes()->mutable_leaf();
boosted_trees::trees::DecisionTreeMetadata* const tree_metadata =
ensemble_resource->LastTreeMetadata();
- tree_metadata->set_is_finalized(
- learner_config_.constraints().max_tree_depth() <= 1);
+ tree_metadata->set_is_finalized(max_tree_depth <= 1);
tree_metadata->set_num_tree_weight_updates(1);
} else {
// The growable tree is by definition the last tree in the ensemble.
@@ -518,8 +524,7 @@ class GrowTreeEnsembleOp : public OpKernel {
<< num_trees - 1 << " of ensemble of " << num_trees << " trees.";
// Update growable tree metadata.
tree_metadata->set_num_layers_grown(new_num_layers);
- tree_metadata->set_is_finalized(
- new_num_layers >= learner_config_.constraints().max_tree_depth());
+ tree_metadata->set_is_finalized(new_num_layers >= max_tree_depth);
}
UpdateTreeWeightsIfDropout(ensemble_resource, dropout_seed);
return ensemble_resource->LastTree();
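
The point of threading max_tree_depth through as a kernel argument is that the depth cap becomes a runtime tensor rather than a value frozen into the serialized learner config. A hedged sketch, mirroring the updated tests further below (surrounding setup omitted; handler inputs are placeholders):

# max_tree_depth is now a scalar int32 input to GrowTreeEnsemble, so it can be
# fed from a variable and changed between runs without re-serializing the
# learner config.
max_tree_depth = variables.Variable(
    initial_value=learner_config.constraints.max_tree_depth)
grow_op = training_ops.grow_tree_ensemble(
    tree_ensemble_handle,
    stamp_token=0,
    next_stamp_token=1,
    learning_rate=0.1,
    partition_ids=[handler1_partitions],
    gains=[handler1_gains],
    splits=[handler1_split],
    learner_config=learner_config.SerializeToString(),
    dropout_seed=123,
    center_bias=True,
    max_tree_depth=max_tree_depth)  # new input added by this change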
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
index c120dd8a6c..f19e5116f5 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
@@ -58,6 +58,8 @@ namespace quantiles {
// Compute: O(n * log(1/eps * log(eps * n))).
// Memory: O(1/eps * log^2(eps * n)) <- for one worker streaming through the
// entire dataset.
+// An epsilon value of zero would make the algorithm extremely inefficient
+// and is therefore disallowed.
template <typename ValueType, typename WeightType,
typename CompareFn = std::less<ValueType>>
class WeightedQuantilesStream {
@@ -69,6 +71,9 @@ class WeightedQuantilesStream {
explicit WeightedQuantilesStream(double eps, int64 max_elements)
: eps_(eps), buffer_(1LL, 2LL), finalized_(false) {
+ // See the class documentation. An epsilon value of zero could cause
+ // performance issues.
+ QCHECK(eps > 0) << "An epsilon value of zero is not allowed.";
std::tie(max_levels_, block_size_) = GetQuantileSpecs(eps, max_elements);
buffer_ = Buffer(block_size_, max_elements);
summary_levels_.reserve(max_levels_);
diff --git a/tensorflow/contrib/boosted_trees/ops/training_ops.cc b/tensorflow/contrib/boosted_trees/ops/training_ops.cc
index f63c199ad6..22ac9edb72 100644
--- a/tensorflow/contrib/boosted_trees/ops/training_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/training_ops.cc
@@ -56,6 +56,7 @@ REGISTER_OP("GrowTreeEnsemble")
.Input("next_stamp_token: int64")
.Input("learning_rate: float")
.Input("dropout_seed: int64")
+ .Input("max_tree_depth: int32")
.Input("partition_ids: num_handlers * int32")
.Input("gains: num_handlers * float")
.Input("splits: num_handlers * string")
@@ -67,6 +68,8 @@ REGISTER_OP("GrowTreeEnsemble")
TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused_input));
// Dropout seed.
TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused_input));
+ // Maximum tree depth.
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused_input));
return Status::OK();
})
.Doc(R"doc(
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
index 3e524efbea..e39e1de8d1 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
@@ -296,7 +296,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
# Dropout does not change anything here, tree is not finalized.
- dropout_probability=0.5).SerializeToString()
+ dropout_probability=0.5)
# Prepare handler inputs.
# Note that handlers 1 & 3 have the same gain but different splits.
@@ -321,9 +321,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
],
gains=[handler1_gains, handler2_gains, handler3_gains],
splits=[handler1_split, handler2_split, handler3_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect the simpler split from handler 1 to be chosen.
@@ -443,7 +444,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
# Dropout does not change anything here - tree is not finalized.
- dropout_probability=0.5).SerializeToString()
+ dropout_probability=0.5)
# Prepare handler inputs.
# Handler 1 only has a candidate for partition 1, handler 2 has candidates
@@ -472,9 +473,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
],
gains=[handler1_gains, handler2_gains, handler3_gains],
splits=[handler1_split, handler2_split, handler3_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect the split for partition 1 to be chosen from handler 1 and
@@ -632,8 +634,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
max_depth=1,
min_node_weight=0,
pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
- growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
- )
+ growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
# Prepare handler inputs.
handler1_partitions = np.array([0], dtype=np.int32)
@@ -657,9 +658,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
],
gains=[handler1_gains, handler2_gains, handler3_gains],
splits=[handler1_split, handler2_split, handler3_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect a new tree to be added with the split from handler 1.
@@ -773,8 +775,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
max_depth=1,
min_node_weight=0,
pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
- growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
- )
+ growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
# Prepare handler inputs.
# All handlers have negative gain.
@@ -794,9 +795,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
partition_ids=[handler1_partitions, handler2_partitions],
gains=[handler1_gains, handler2_gains],
splits=[handler1_split, handler2_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect the ensemble to be empty.
@@ -839,8 +841,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
max_depth=1,
min_node_weight=0,
pruning_mode=learner_pb2.LearnerConfig.POST_PRUNE,
- growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
- )
+ growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
# Prepare handler inputs.
# Note that handlers 1 & 3 have the same gain but different splits.
@@ -865,9 +866,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
],
gains=[handler1_gains, handler2_gains, handler3_gains],
splits=[handler1_split, handler2_split, handler3_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect the simpler split from handler 1 to be chosen.
@@ -946,8 +948,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
max_depth=2,
min_node_weight=0,
pruning_mode=learner_pb2.LearnerConfig.POST_PRUNE,
- growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
- )
+ growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
# Prepare handler inputs.
# All handlers have negative gain.
@@ -967,9 +968,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
partition_ids=[handler1_partitions, handler2_partitions],
gains=[handler1_gains, handler2_gains],
splits=[handler1_split, handler2_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect the split from handler 2 to be chosen despite the negative gain.
@@ -1048,9 +1050,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
partition_ids=[handler1_partitions],
gains=[handler1_gains],
splits=[handler1_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect the ensemble to be empty as post-pruning will prune
@@ -1094,8 +1097,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
max_depth=2,
min_node_weight=0,
pruning_mode=learner_pb2.LearnerConfig.POST_PRUNE,
- growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE).SerializeToString(
- )
+ growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
# Prepare handler inputs.
# Second handler has positive gain.
@@ -1115,9 +1117,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
partition_ids=[handler1_partitions, handler2_partitions],
gains=[handler1_gains, handler2_gains],
splits=[handler1_split, handler2_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect the split from handler 2 to be chosen despite the negative gain.
@@ -1194,9 +1197,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
partition_ids=[handler1_partitions],
gains=[handler1_gains],
splits=[handler1_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect the negative gain split of partition 1 to be pruned and the
@@ -1335,7 +1339,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
growing_mode=learner_pb2.LearnerConfig.LAYER_BY_LAYER,
# Dropout will have no effect, since the tree will not be fully grown.
- dropout_probability=1.0).SerializeToString()
+ dropout_probability=1.0)
# Prepare handler inputs.
# Handler 1 only has a candidate for partition 1, handler 2 has candidates
@@ -1364,9 +1368,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
],
gains=[handler1_gains, handler2_gains, handler3_gains],
splits=[handler1_split, handler2_split, handler3_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect the split for partition 1 to be chosen from handler 1 and
@@ -1543,7 +1548,7 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
min_node_weight=0,
pruning_mode=learner_pb2.LearnerConfig.PRE_PRUNE,
growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE,
- dropout_probability=1.0).SerializeToString()
+ dropout_probability=1.0)
# Prepare handler inputs.
handler1_partitions = np.array([0], dtype=np.int32)
@@ -1567,9 +1572,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
],
gains=[handler1_gains, handler2_gains, handler3_gains],
splits=[handler1_split, handler2_split, handler3_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
# Expect a new tree to be added with the split from handler 1.
@@ -1669,7 +1675,6 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
growing_mode=learner_pb2.LearnerConfig.WHOLE_TREE)
learner_config.constraints.max_number_of_unique_feature_columns = 3
- learner_config = learner_config.SerializeToString()
# Prepare handler inputs.
handler1_partitions = np.array([0], dtype=np.int32)
handler1_gains = np.array([7.62], dtype=np.float32)
@@ -1692,9 +1697,10 @@ class GrowTreeEnsembleOpTest(test_util.TensorFlowTestCase):
],
gains=[handler1_gains, handler2_gains, handler3_gains],
splits=[handler1_split, handler2_split, handler3_split],
- learner_config=learner_config,
+ learner_config=learner_config.SerializeToString(),
dropout_seed=123,
- center_bias=True)
+ center_bias=True,
+ max_tree_depth=learner_config.constraints.max_tree_depth)
session.run(grow_op)
_, serialized = session.run(
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 1ee7f2395e..19e053fcb6 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -287,7 +287,8 @@ class GradientBoostedDecisionTreeModel(object):
loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS,
feature_columns=None,
use_core_columns=False,
- output_leaf_index=False):
+ output_leaf_index=False,
+ output_leaf_index_modes=None):
"""Construct a new GradientBoostedDecisionTreeModel function.
Args:
@@ -307,6 +308,9 @@ class GradientBoostedDecisionTreeModel(object):
used.
output_leaf_index: A boolean variable indicating whether to output leaf
index into predictions dictionary.
+ output_leaf_index_modes: A list of modes from (TRAIN, EVAL, INFER) that
+ dictates when leaf indices will be output. By default, leaf indices are
+ only output in INFER mode.
Raises:
ValueError: if inputs are not valid.
@@ -376,6 +380,8 @@ class GradientBoostedDecisionTreeModel(object):
self._learner_config = learner_config
self._feature_columns = feature_columns
self._learner_config_serialized = learner_config.SerializeToString()
+ self._max_tree_depth = variables.Variable(
+ initial_value=self._learner_config.constraints.max_tree_depth)
self._attempted_trees = variables.Variable(
initial_value=array_ops.zeros([], dtypes.int64),
trainable=False,
@@ -404,7 +410,16 @@ class GradientBoostedDecisionTreeModel(object):
self._learner_config.multi_class_strategy ==
learner_pb2.LearnerConfig.TREE_PER_CLASS and
learner_config.num_classes == 2)
+
+ if output_leaf_index_modes is None:
+ output_leaf_index_modes = [learn.ModeKeys.INFER]
+ elif not all(
+ mode in (learn.ModeKeys.TRAIN, learn.ModeKeys.EVAL,
+ learn.ModeKeys.INFER) for mode in output_leaf_index_modes):
+ raise ValueError("output_leaf_index_modes should only contain ModeKeys.")
+
self._output_leaf_index = output_leaf_index
+ self._output_leaf_index_modes = output_leaf_index_modes
def _predict_and_return_dict(self, ensemble_handle, ensemble_stamp, mode):
"""Runs prediction and returns a dictionary of the prediction results.
@@ -435,8 +450,7 @@ class GradientBoostedDecisionTreeModel(object):
# the right stamp.
with ops.control_dependencies(ensemble_stats):
leaf_index = None
- # Only used in infer (predict), not used in train and eval.
- if self._output_leaf_index and mode == learn.ModeKeys.INFER:
+ if self._output_leaf_index and mode in self._output_leaf_index_modes:
predictions, _, leaf_index = (
prediction_ops).gradient_trees_prediction_verbose(
ensemble_handle,
@@ -508,9 +522,6 @@ class GradientBoostedDecisionTreeModel(object):
if not input_deps:
raise ValueError("No input tensors for prediction.")
- if any(i.device != input_deps[0].device for i in input_deps):
- raise ValueError("All input tensors should be on the same device.")
-
# Get most current model stamp.
ensemble_stamp = model_ops.tree_ensemble_stamp_token(self._ensemble_handle)
@@ -1042,7 +1053,8 @@ class GradientBoostedDecisionTreeModel(object):
splits=split_info_list,
learner_config=self._learner_config_serialized,
dropout_seed=dropout_seed,
- center_bias=self._center_bias)
+ center_bias=self._center_bias,
+ max_tree_depth=self._max_tree_depth)
def _grow_ensemble_not_ready_fn():
# Don't grow the ensemble, just update the stamp.
@@ -1056,7 +1068,8 @@ class GradientBoostedDecisionTreeModel(object):
splits=[],
learner_config=self._learner_config_serialized,
dropout_seed=dropout_seed,
- center_bias=self._center_bias)
+ center_bias=self._center_bias,
+ max_tree_depth=self._max_tree_depth)
def _grow_ensemble_fn():
# Conditionally grow an ensemble depending on whether the splits
@@ -1096,6 +1109,9 @@ class GradientBoostedDecisionTreeModel(object):
def get_number_of_trees_tensor(self):
return self._finalized_trees, self._attempted_trees
+ def get_max_tree_depth(self):
+ return self._max_tree_depth
+
def train(self, loss, predictions_dict, labels):
"""Updates the accumalator stats and grows the ensemble.
diff --git a/tensorflow/contrib/cloud/README.md b/tensorflow/contrib/cloud/README.md
index 134ce057f4..a80d8965f3 100644
--- a/tensorflow/contrib/cloud/README.md
+++ b/tensorflow/contrib/cloud/README.md
@@ -1,8 +1,8 @@
# Cloud #
-## BigTable ##
+## Cloud Bigtable ##
-[Google Cloud BigTable](https://cloud.google.com/bigtable/) is a high
+[Google Cloud Bigtable](https://cloud.google.com/bigtable/) is a high
performance storage system that can store and serve training data. This contrib
package contains an experimental integration with TensorFlow.
diff --git a/tensorflow/contrib/cloud/__init__.py b/tensorflow/contrib/cloud/__init__.py
index af81106a68..8efd259946 100644
--- a/tensorflow/contrib/cloud/__init__.py
+++ b/tensorflow/contrib/cloud/__init__.py
@@ -25,8 +25,8 @@ from tensorflow.contrib.cloud.python.ops.bigquery_reader_ops import *
from tensorflow.contrib.cloud.python.ops.gcs_config_ops import *
if os.name != 'nt':
- from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigTable
from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigtableClient
+ from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigtableTable
del os
@@ -34,8 +34,8 @@ from tensorflow.python.util.all_util import remove_undocumented
_allowed_symbols = [
'BigQueryReader',
- 'BigTable',
'BigtableClient',
+ 'BigtableTable',
'BlockCacheParams',
'configure_colab_session',
'configure_gcs',
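
Since the rename is user-visible, callers need a one-line import update; for example:

# Before this change:
#   from tensorflow.contrib.cloud import BigTable
# After it, the table abstraction is exported as BigtableTable:
from tensorflow.contrib.cloud import BigtableClient
from tensorflow.contrib.cloud import BigtableTable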
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 8f521ffee4..f9dc3effd0 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -259,11 +259,11 @@ class TPUClusterResolver(ClusterResolver):
if 'state' in response and response['state'] != 'READY':
raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' %
- (self._tpu, response['state']))
+ (compat.as_text(self._tpu), response['state']))
if 'health' in response and response['health'] != 'HEALTHY':
- raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
- response['health']))
+ raise RuntimeError('TPU "%s" is unhealthy: "%s"' %
+ (compat.as_text(self._tpu), response['health']))
if 'networkEndpoints' in response:
worker_list = [
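
The reason for compat.as_text here, illustrated: under Python 3, interpolating a bytes TPU name into a str renders its repr rather than its contents.

# Illustration only: self._tpu may be bytes, and '%s' on bytes in Python 3
# yields the repr, producing messages like: TPU "b'grpc://...'" is unhealthy.
from tensorflow.python.util import compat

tpu = b"grpc://10.0.0.1:8470"
print('TPU "%s" is not yet ready' % tpu)                  # TPU "b'grpc://10.0.0.1:8470'" ...
print('TPU "%s" is not yet ready' % compat.as_text(tpu))  # TPU "grpc://10.0.0.1:8470" ...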
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 708618dcb0..f6c928e2be 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -394,16 +394,20 @@ if (tensorflow_ENABLE_GPU)
# by default we assume compute capability 3.5 and 5.2. If you change this, change it in
# CUDA_NVCC_FLAGS and cuda_config.h below
- set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_37,code=\"sm_37,compute_37\")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_52,code=\"sm_52,compute_52\")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_60,code=\"sm_60,compute_60\")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_61,code=\"sm_61,compute_61\")
+ set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_70,code=\"sm_70,compute_70\")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true) # Flush denormals to zero
set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
include_directories(${CUDA_INCLUDE})
if (WIN32)
- add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2)
+ add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.7,5.2,6.0,6.1,7.0)
else (WIN32)
- # Without these double quotes, cmake in Linux makes it "-DTF_EXTRA_CUDA_CAPABILITIES=3.0, -D3.5, -D5.2" for cc, which incurs build breaks
- add_definitions(-DGOOGLE_CUDA=1 -D"TF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2")
+ # Without these double quotes, cmake in Linux makes it "-DTF_EXTRA_CUDA_CAPABILITIES=3.7, -D5.2, ..." for cc, which breaks the build
+ add_definitions(-DGOOGLE_CUDA=1 -D"TF_EXTRA_CUDA_CAPABILITIES=3.7,5.2,6.0,6.1,7.0")
endif (WIN32)
if (WIN32)
@@ -452,7 +456,7 @@ if (tensorflow_ENABLE_GPU)
FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
"#ifndef CUDA_CUDA_CONFIG_H_\n"
"#define CUDA_CUDA_CONFIG_H_\n"
- "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
+ "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.7\"),CudaVersion(\"5.2\"),CudaVersion(\"6.0\"),CudaVersion(\"6.1\"),CudaVersion(\"7.0\")\n"
"#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n"
"#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n"
"#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
@@ -467,7 +471,6 @@ if (tensorflow_ENABLE_GPU)
${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
- ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_fp16.h
${CUDA_TOOLKIT_TARGET_DIR}/include/device_functions.h
${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h
${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index 5931c8a279..6c9ab6aeb8 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -219,8 +219,10 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
op_def)
#Use Graph's hidden methods to add the op
to_graph._record_op_seen_by_control_dependencies(new_op)
- for device_function in reversed(to_graph._device_function_stack):
+ # pylint: disable=protected-access
+ for device_function in to_graph._device_functions_outer_to_inner:
new_op._set_device(device_function(new_op))
+ # pylint: enable=protected-access
return new_op
diff --git a/tensorflow/contrib/crf/__init__.py b/tensorflow/contrib/crf/__init__.py
index 046c509626..615e62b16f 100644
--- a/tensorflow/contrib/crf/__init__.py
+++ b/tensorflow/contrib/crf/__init__.py
@@ -20,6 +20,7 @@ See the @{$python/contrib.crf} guide.
@@crf_decode
@@crf_log_likelihood
@@crf_log_norm
+@@crf_multitag_sequence_score
@@crf_sequence_score
@@crf_unary_score
@@CrfDecodeBackwardRnnCell
@@ -36,6 +37,7 @@ from tensorflow.contrib.crf.python.ops.crf import crf_binary_score
from tensorflow.contrib.crf.python.ops.crf import crf_decode
from tensorflow.contrib.crf.python.ops.crf import crf_log_likelihood
from tensorflow.contrib.crf.python.ops.crf import crf_log_norm
+from tensorflow.contrib.crf.python.ops.crf import crf_multitag_sequence_score
from tensorflow.contrib.crf.python.ops.crf import crf_sequence_score
from tensorflow.contrib.crf.python.ops.crf import crf_unary_score
from tensorflow.contrib.crf.python.ops.crf import CrfDecodeBackwardRnnCell
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 74f2ec22ff..f56a973f6f 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -31,6 +31,15 @@ from tensorflow.python.platform import test
class CrfTest(test.TestCase):
+ def calculateSequenceScore(self, inputs, transition_params, tag_indices,
+ sequence_lengths):
+ expected_unary_score = sum(
+ inputs[i][tag_indices[i]] for i in range(sequence_lengths))
+ expected_binary_score = sum(
+ transition_params[tag_indices[i], tag_indices[i + 1]]
+ for i in range(sequence_lengths - 1))
+ return expected_unary_score + expected_binary_score
+
def testCrfSequenceScore(self):
transition_params = np.array(
[[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
@@ -60,14 +69,55 @@ class CrfTest(test.TestCase):
transition_params=constant_op.constant(transition_params))
sequence_score = array_ops.squeeze(sequence_score, [0])
tf_sequence_score = sess.run(sequence_score)
- expected_unary_score = sum(inputs[i][tag_indices[i]]
- for i in range(sequence_lengths))
- expected_binary_score = sum(
- transition_params[tag_indices[i], tag_indices[i + 1]]
- for i in range(sequence_lengths - 1))
- expected_sequence_score = expected_unary_score + expected_binary_score
+ expected_sequence_score = self.calculateSequenceScore(
+ inputs, transition_params, tag_indices, sequence_lengths)
self.assertAllClose(tf_sequence_score, expected_sequence_score)
+ def testCrfMultiTagSequenceScore(self):
+ transition_params = np.array(
+ [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
+ # Test both the length-1 and regular cases.
+ sequence_lengths_list = [
+ np.array(3, dtype=np.int32),
+ np.array(1, dtype=np.int32)
+ ]
+ inputs_list = [
+ np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
+ dtype=np.float32),
+ np.array([[4, 5, -3]],
+ dtype=np.float32),
+ ]
+ tag_bitmap_list = [
+ np.array(
+ [[True, True, False], [True, False, True], [False, True, True],
+ [True, False, True]],
+ dtype=np.bool),
+ np.array([[True, True, False]], dtype=np.bool)
+ ]
+ for sequence_lengths, inputs, tag_bitmap in zip(
+ sequence_lengths_list, inputs_list, tag_bitmap_list):
+ with self.test_session() as sess:
+ sequence_score = crf.crf_multitag_sequence_score(
+ inputs=array_ops.expand_dims(inputs, 0),
+ tag_bitmap=array_ops.expand_dims(tag_bitmap, 0),
+ sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+ transition_params=constant_op.constant(transition_params))
+ sequence_score = array_ops.squeeze(sequence_score, [0])
+ tf_sum_sequence_score = sess.run(sequence_score)
+ all_indices_list = [
+ single_index_bitmap.nonzero()[0]
+ for single_index_bitmap in tag_bitmap[:sequence_lengths]
+ ]
+ expected_sequence_scores = [
+ self.calculateSequenceScore(inputs, transition_params, indices,
+ sequence_lengths)
+ for indices in itertools.product(*all_indices_list)
+ ]
+ expected_log_sum_exp_sequence_scores = np.logaddexp.reduce(
+ expected_sequence_scores)
+ self.assertAllClose(tf_sum_sequence_score,
+ expected_log_sum_exp_sequence_scores)
+
def testCrfUnaryScore(self):
inputs = np.array(
[[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 2d2cbdc199..8a7ff61bc8 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -67,7 +67,7 @@ __all__ = [
"crf_sequence_score", "crf_log_norm", "crf_log_likelihood",
"crf_unary_score", "crf_binary_score", "CrfForwardRnnCell",
"viterbi_decode", "crf_decode", "CrfDecodeForwardRnnCell",
- "CrfDecodeBackwardRnnCell"
+ "CrfDecodeBackwardRnnCell", "crf_multitag_sequence_score"
]
@@ -114,6 +114,56 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths,
false_fn=_multi_seq_fn)
+def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths,
+ transition_params):
+ """Computes the unnormalized score of all tag sequences matching tag_bitmap.
+
+ tag_bitmap enables more than one tag to be considered correct at each time
+ step. This is useful when an observed output at a given time step is
+ consistent with more than one tag, and thus the log likelihood of that
+ observation must take into account all possible consistent tags.
+
+ Using one-hot vectors in tag_bitmap gives results identical to
+ crf_sequence_score.
+
+ Args:
+ inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+ to use as input to the CRF layer.
+ tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor
+ representing all active tags at each index for which to calculate the
+ unnormalized score.
+ sequence_lengths: A [batch_size] vector of true sequence lengths.
+ transition_params: A [num_tags, num_tags] transition matrix.
+ Returns:
+ sequence_scores: A [batch_size] vector of unnormalized sequence scores.
+ """
+
+ # If max_seq_len is 1, we skip the score calculation and simply gather the
+ # unary potentials of all active tags.
+ def _single_seq_fn():
+ filtered_inputs = array_ops.where(
+ tag_bitmap, inputs,
+ array_ops.fill(array_ops.shape(inputs), float("-inf")))
+ return math_ops.reduce_logsumexp(
+ filtered_inputs, axis=[1, 2], keepdims=False)
+
+ def _multi_seq_fn():
+ # Compute the logsumexp of all scores of sequences matching the given tags.
+ filtered_inputs = array_ops.where(
+ tag_bitmap, inputs,
+ array_ops.fill(array_ops.shape(inputs), float("-inf")))
+ return crf_log_norm(
+ inputs=filtered_inputs,
+ sequence_lengths=sequence_lengths,
+ transition_params=transition_params)
+
+ return utils.smart_cond(
+ pred=math_ops.equal(inputs.shape[1].value or array_ops.shape(inputs)[1],
+ 1),
+ true_fn=_single_seq_fn,
+ false_fn=_multi_seq_fn)
+
+
def crf_log_norm(inputs, sequence_lengths, transition_params):
"""Computes the normalization for a CRF.
diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
index b3d464d716..32f03ca683 100644
--- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc
+++ b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include <deque>
#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/dataset.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/resource_op_kernel.h"
@@ -23,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/util/device_name_utils.h"
namespace tensorflow {
+namespace {
struct BufferElement {
// The producer sets `status` if getting the input element fails.
@@ -473,4 +475,466 @@ class IteratorGetDeviceOp : public OpKernel {
REGISTER_KERNEL_BUILDER(Name("IteratorGetDevice").Device(DEVICE_CPU),
IteratorGetDeviceOp);
+Status VerifyTypesMatch(const DataTypeVector& expected,
+ const DataTypeVector& received) {
+ if (expected.size() != received.size()) {
+ return errors::InvalidArgument(
+ "Number of components does not match: expected ", expected.size(),
+ " types but got ", received.size(), ".");
+ }
+ for (size_t i = 0; i < expected.size(); ++i) {
+ if (expected[i] != received[i]) {
+ return errors::InvalidArgument("Data type mismatch at component ", i,
+ ": expected ", DataTypeString(expected[i]),
+ " but got ", DataTypeString(received[i]),
+ ".");
+ }
+ }
+ return Status::OK();
+}
+
+Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
+ const std::vector<PartialTensorShape>& received) {
+ if (expected.size() != received.size()) {
+ return errors::InvalidArgument(
+ "Number of components does not match: expected ", expected.size(),
+ " shapes but got ", received.size(), ".");
+ }
+ for (size_t i = 0; i < expected.size(); ++i) {
+ if (!expected[i].IsCompatibleWith(received[i])) {
+ return errors::InvalidArgument("Incompatible shapes at component ", i,
+ ": expected ", expected[i].DebugString(),
+ " but got ", received[i].DebugString(),
+ ".");
+ }
+ }
+
+ return Status::OK();
+}
+
+string SanitizeThreadSuffix(string suffix) {
+ string clean;
+ for (int i = 0; i < suffix.size(); ++i) {
+ const char ch = suffix[i];
+ if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+ (ch >= '0' && ch <= '9') || ch == '_' || ch == '-') {
+ clean += ch;
+ } else {
+ clean += '_';
+ }
+ }
+ return clean;
+}
+
+class MultiDeviceIterator : public ResourceBase {
+ public:
+ MultiDeviceIterator(const DataTypeVector& output_types,
+ const std::vector<PartialTensorShape>& output_shapes,
+ const std::vector<string>& devices,
+ std::unique_ptr<FunctionLibraryDefinition> flib_def,
+ std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
+ FunctionLibraryRuntime* lib)
+ : output_types_(output_types),
+ output_shapes_(output_shapes),
+ devices_(devices),
+ flib_def_(std::move(flib_def)),
+ pflr_(std::move(pflr)),
+ lib_(lib) {
+ buffer_.resize(devices_.size());
+ }
+
+ string DebugString() override {
+ return strings::StrCat("MultiDeviceIterator");
+ }
+
+ Status Init(std::unique_ptr<IteratorBase> iterator, int64* incarnation_id) {
+ mutex_lock l(mu_);
+ if (iterator) {
+ TF_RETURN_IF_ERROR(
+ VerifyTypesMatch(output_types_, iterator->output_dtypes()));
+ TF_RETURN_IF_ERROR(
+ VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
+ }
+ host_iterator_.reset(iterator.release());
+ incarnation_id_++;
+ *incarnation_id = incarnation_id_;
+ max_buffer_size_ = 0;
+ num_elements_ = 0;
+ buffer_.clear();
+ buffer_.resize(devices_.size());
+ return Status::OK();
+ }
+
+ Status GetNextFromShard(IteratorContext* ctx, int shard_num,
+ int64 incarnation_id,
+ std::vector<Tensor>* out_tensors,
+ bool* end_of_sequence) {
+ // TODO(rohanj): This might strand elements in other shards; there is an
+ // opportunity for smarter locking semantics.
+ mutex_lock l(mu_);
+ // Make sure we're in the right incarnation.
+ if (incarnation_id != incarnation_id_) {
+ return errors::InvalidArgument(
+ "Current incarnation: ", incarnation_id_,
+ "; Supplied incarnation: ", incarnation_id);
+ }
+ // Then look it up in the buffer.
+ if (!buffer_[shard_num].empty()) {
+ const HostBufferElement& elem = buffer_[shard_num].front();
+ *out_tensors = elem.value;
+ *end_of_sequence = elem.end_of_sequence;
+ Status s = elem.status;
+ buffer_[shard_num].pop_front();
+ return s;
+ }
+ std::shared_ptr<IteratorBase> captured_iterator(host_iterator_);
+ if (captured_iterator) {
+ if (lib_ != nullptr) {
+ ctx->set_lib(lib_);
+ }
+ while (true) {
+ HostBufferElement elem;
+ elem.status =
+ captured_iterator->GetNext(ctx, &elem.value, &elem.end_of_sequence);
+ int buffer_index = num_elements_ % devices_.size();
+ num_elements_++;
+ if (buffer_index == shard_num) {
+ out_tensors->swap(elem.value);
+ *end_of_sequence = elem.end_of_sequence;
+ return elem.status;
+ } else {
+ buffer_[buffer_index].push_back(std::move(elem));
+ // TODO(rohanj): Put an upper bound to buffer size.
+ if (buffer_[buffer_index].size() > max_buffer_size_) {
+ max_buffer_size_ = buffer_[buffer_index].size();
+ VLOG(1) << "MultiDeviceIterator: Max buffer size increased to: "
+ << max_buffer_size_;
+ }
+ }
+ }
+ } else {
+ return errors::FailedPrecondition("Iterator not initialized");
+ }
+ return Status::OK();
+ }
+
+ const DataTypeVector& output_types() const { return output_types_; }
+
+ const std::vector<PartialTensorShape>& output_shapes() const {
+ return output_shapes_;
+ }
+
+ std::shared_ptr<const FunctionLibraryDefinition> function_library() {
+ tf_shared_lock l(mu_);
+ return lib_def_;
+ }
+
+ private:
+ struct HostBufferElement {
+ Status status;
+ bool end_of_sequence;
+ std::vector<Tensor> value;
+ };
+
+ mutex mu_;
+ const DataTypeVector output_types_;
+ const std::vector<PartialTensorShape> output_shapes_;
+ const std::vector<string> devices_;
+ int64 num_elements_ GUARDED_BY(mu_) = 0;
+ int64 max_buffer_size_ GUARDED_BY(mu_) = 0;
+ int64 incarnation_id_ GUARDED_BY(mu_) = 0;
+ std::vector<std::deque<HostBufferElement>> buffer_ GUARDED_BY(mu_);
+ std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+ std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+ FunctionLibraryRuntime* lib_ = nullptr; // not owned.
+ std::shared_ptr<IteratorBase> host_iterator_;
+ std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
+};
+
+// Just creates a MultiDeviceIterator and returns it.
+class MultiDeviceIteratorHandleOp : public OpKernel {
+ public:
+ explicit MultiDeviceIteratorHandleOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("devices", &devices_));
+ }
+
+ // The resource is deleted from the resource manager only when it is private
+ // to the kernel.
+ ~MultiDeviceIteratorHandleOp() override {
+ if (resource_ != nullptr) {
+ resource_->Unref();
+ if (cinfo_.resource_is_private_to_kernel()) {
+ if (!cinfo_.resource_manager()
+ ->template Delete<MultiDeviceIterator>(cinfo_.container(),
+ cinfo_.name())
+ .ok()) {
+ // Do nothing; the resource can have been deleted by session resets.
+ }
+ }
+ }
+ }
+
+ void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
+ {
+ mutex_lock l(mu_);
+ if (resource_ == nullptr) {
+ FunctionLibraryRuntime* lib;
+ std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+ std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+ OP_REQUIRES_OK(context, context->function_library()->Clone(
+ &flib_def, &pflr, &lib));
+ ResourceMgr* mgr = context->resource_manager();
+ OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+
+ MultiDeviceIterator* resource;
+ OP_REQUIRES_OK(
+ context,
+ mgr->LookupOrCreate<MultiDeviceIterator>(
+ cinfo_.container(), cinfo_.name(), &resource,
+ [this, lib, &flib_def, &pflr](MultiDeviceIterator** ret)
+ EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+ *ret = new MultiDeviceIterator(
+ output_types_, output_shapes_, devices_,
+ std::move(flib_def), std::move(pflr), lib);
+ return Status::OK();
+ }));
+
+ Status s = VerifyResource(resource);
+ if (TF_PREDICT_FALSE(!s.ok())) {
+ resource->Unref();
+ context->SetStatus(s);
+ return;
+ }
+
+ resource_ = resource;
+ }
+ }
+ OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+ context, 0, cinfo_.container(), cinfo_.name(),
+ MakeTypeIndex<MultiDeviceIterator>()));
+ }
+
+ private:
+ // During the first Compute(), the resource is either created or looked up
+ // using shared_name. In the latter case, the found resource should be
+ // verified to be compatible with this op's configuration. The verification
+ // may fail in cases such as two graphs asking for queues with the same
+ // shared name but inconsistent capacities.
+ Status VerifyResource(MultiDeviceIterator* resource) {
+ TF_RETURN_IF_ERROR(
+ VerifyTypesMatch(output_types_, resource->output_types()));
+ TF_RETURN_IF_ERROR(
+ VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
+ return Status::OK();
+ }
+
+ mutex mu_;
+ ContainerInfo cinfo_; // Written once under mu_ then constant afterwards.
+ MultiDeviceIterator* resource_ GUARDED_BY(mu_) = nullptr;
+ DataTypeVector output_types_;
+ std::vector<PartialTensorShape> output_shapes_;
+ const int graph_def_version_;
+ string name_;
+ string container_;
+ std::vector<string> devices_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MultiDeviceIterator").Device(DEVICE_CPU),
+ MultiDeviceIteratorHandleOp);
+
+// Calls init on the MultiDeviceIterator.
+class MultiDeviceIteratorInitOp : public OpKernel {
+ public:
+ explicit MultiDeviceIteratorInitOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ DatasetBase* dataset;
+ OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
+ MultiDeviceIterator* resource;
+ OP_REQUIRES_OK(ctx,
+ LookupResource(ctx, HandleFromInput(ctx, 1), &resource));
+ core::ScopedUnref unref(resource);
+
+ IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
+ std::unique_ptr<IteratorBase> iterator;
+ OP_REQUIRES_OK(ctx,
+ dataset->MakeIterator(&iter_ctx, "Iterator", &iterator));
+ int64 incarnation_id;
+ OP_REQUIRES_OK(ctx, resource->Init(std::move(iterator), &incarnation_id));
+ Tensor tensor_incarnation_id(DT_INT64, TensorShape({}));
+ tensor_incarnation_id.scalar<int64>()() = incarnation_id;
+ OP_REQUIRES_OK(ctx,
+ ctx->set_output("incarnation_id", tensor_incarnation_id));
+ }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MultiDeviceIteratorInit").Device(DEVICE_CPU),
+ MultiDeviceIteratorInitOp);
+
+// Calls GetNextFromShard(shard) and returns a vector of Tensors as output.
+// TODO(rohanj): Implement using BackgroundWorker that Derek built?
+class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
+ public:
+ explicit MultiDeviceIteratorGetNextFromShardOp(OpKernelConstruction* ctx)
+ : AsyncOpKernel(ctx),
+ thread_pool_(new thread::ThreadPool(
+ ctx->env(), ThreadOptions(),
+ strings::StrCat("multi_device_iterator_get_next_thread_",
+ SanitizeThreadSuffix(name())),
+ 1 /* num_threads */, false /* low_latency_hint */)) {}
+
+ void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+ const Tensor* tensor_shard_num;
+ OP_REQUIRES_OK_ASYNC(ctx, ctx->input("shard_num", &tensor_shard_num), done);
+ int32 shard_num = tensor_shard_num->scalar<int32>()();
+
+ const Tensor* tensor_incarnation_id;
+ OP_REQUIRES_OK_ASYNC(
+ ctx, ctx->input("incarnation_id", &tensor_incarnation_id), done);
+ int64 incarnation_id = tensor_incarnation_id->scalar<int64>()();
+
+ MultiDeviceIterator* iterator;
+ OP_REQUIRES_OK_ASYNC(
+ ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
+ thread_pool_->Schedule(std::bind(
+ [ctx, iterator, shard_num, incarnation_id](DoneCallback done) {
+ std::vector<Tensor> components;
+ bool end_of_sequence = false;
+
+ IteratorContext::Params params;
+ params.env = ctx->env();
+ params.runner = *(ctx->runner());
+ params.function_library = iterator->function_library();
+ DeviceBase* device = ctx->function_library()->device();
+ params.allocator_getter = [device](AllocatorAttributes attrs) {
+ return device->GetAllocator(attrs);
+ };
+ IteratorContext iter_ctx(std::move(params));
+
+ Status s =
+ iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id,
+ &components, &end_of_sequence);
+ iterator->Unref();
+
+ if (!s.ok()) {
+ ctx->SetStatus(s);
+ } else if (end_of_sequence) {
+ ctx->SetStatus(errors::OutOfRange("End of sequence"));
+ } else {
+ for (int i = 0; i < components.size(); ++i) {
+ // TODO(mrry): Check that the shapes match the shape attrs.
+ ctx->set_output(i, components[i]);
+ }
+ }
+ done();
+ },
+ std::move(done)));
+ }
+
+ private:
+ std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
+
+REGISTER_KERNEL_BUILDER(
+ Name("MultiDeviceIteratorGetNextFromShard").Device(DEVICE_CPU),
+ MultiDeviceIteratorGetNextFromShardOp);
+
+class MultiDeviceIteratorToStringHandleOp : public OpKernel {
+ public:
+ explicit MultiDeviceIteratorToStringHandleOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx) {}
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& resource_handle_t = ctx->input(0);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(resource_handle_t.shape()),
+ errors::InvalidArgument("resource_handle must be a scalar"));
+
+ // Validate that the handle corresponds to a real resource, and
+ // that it is a MultiDeviceIterator.
+ MultiDeviceIterator* resource;
+ OP_REQUIRES_OK(ctx,
+ LookupResource(ctx, HandleFromInput(ctx, 0), &resource));
+ resource->Unref();
+
+ Tensor* string_handle_t;
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_output(0, TensorShape({}), &string_handle_t));
+ string_handle_t->scalar<string>()() =
+ resource_handle_t.scalar<ResourceHandle>()().SerializeAsString();
+ }
+};
+
+REGISTER_KERNEL_BUILDER(
+ Name("MultiDeviceIteratorToStringHandle").Device(DEVICE_CPU),
+ MultiDeviceIteratorToStringHandleOp);
+
+class MultiDeviceIteratorFromStringHandleOp : public OpKernel {
+ public:
+ explicit MultiDeviceIteratorFromStringHandleOp(OpKernelConstruction* ctx)
+ : OpKernel(ctx) {
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+ OP_REQUIRES(
+ ctx,
+ output_types_.empty() || output_shapes_.empty() ||
+ output_types_.size() == output_shapes_.size(),
+ errors::InvalidArgument("If both 'output_types' and 'output_shapes' "
+ "are set, they must have the same length."));
+ }
+
+ void Compute(OpKernelContext* ctx) override {
+ const Tensor& string_handle_t = ctx->input(0);
+ OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(string_handle_t.shape()),
+ errors::InvalidArgument("string_handle must be a scalar"));
+
+ ResourceHandle resource_handle;
+ OP_REQUIRES(
+ ctx,
+ resource_handle.ParseFromString(string_handle_t.scalar<string>()()),
+ errors::InvalidArgument(
+ "Could not parse string_handle as a valid ResourceHandle"));
+
+ OP_REQUIRES(
+ ctx, resource_handle.device() == ctx->device()->attributes().name(),
+ errors::InvalidArgument("Attempted create an iterator on device \"",
+ ctx->device()->attributes().name(),
+ "\" from handle defined on device \"",
+ resource_handle.device(), "\""));
+
+ // Validate that the handle corresponds to a real resource, and
+ // that it is a MultiDeviceIterator.
+ MultiDeviceIterator* resource;
+ OP_REQUIRES_OK(ctx, LookupResource(ctx, resource_handle, &resource));
+ core::ScopedUnref unref_iterator(resource);
+ if (!output_types_.empty()) {
+ OP_REQUIRES_OK(ctx,
+ VerifyTypesMatch(output_types_, resource->output_types()));
+ }
+ if (!output_shapes_.empty()) {
+ OP_REQUIRES_OK(ctx, VerifyShapesCompatible(output_shapes_,
+ resource->output_shapes()));
+ }
+
+ Tensor* resource_handle_t;
+ OP_REQUIRES_OK(
+ ctx, ctx->allocate_output(0, TensorShape({}), &resource_handle_t));
+ resource_handle_t->scalar<ResourceHandle>()() = resource_handle;
+ }
+
+ private:
+ DataTypeVector output_types_;
+ std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(
+ Name("MultiDeviceIteratorFromStringHandle").Device(DEVICE_CPU),
+ MultiDeviceIteratorFromStringHandleOp);
+
+} // anonymous namespace
} // namespace tensorflow
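
Stripped of the TF plumbing, the buffering scheme in GetNextFromShard is a round-robin fan-out: element k of the host iterator belongs to shard k % num_shards, and a request for one shard buffers any elements it pulls past for the others. A plain-Python model of that logic (an illustration, not an API):

import collections

class RoundRobinShards(object):
  """Plain-Python model of MultiDeviceIterator::GetNextFromShard."""

  def __init__(self, source, num_shards):
    self._source = source  # any Python iterator, standing in for the host iterator
    self._buffers = [collections.deque() for _ in range(num_shards)]
    self._num_elements = 0

  def get_next_from_shard(self, shard_num):
    # Serve from this shard's buffer first.
    if self._buffers[shard_num]:
      return self._buffers[shard_num].popleft()
    # Otherwise pull from the source, parking elements owned by other shards.
    while True:
      elem = next(self._source)  # StopIteration plays the role of end_of_sequence
      index = self._num_elements % len(self._buffers)
      self._num_elements += 1
      if index == shard_num:
        return elem
      self._buffers[index].append(elem)

shards = RoundRobinShards(iter(range(10)), num_shards=2)
print([shards.get_next_from_shard(0) for _ in range(5)])  # [0, 2, 4, 6, 8]
print([shards.get_next_from_shard(1) for _ in range(5)])  # [1, 3, 5, 7, 9]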
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
index b5c6f2e241..66a7c7fdcd 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -145,6 +145,80 @@ Resets the FunctionBufferingResource.
function_buffer_resource: The FunctionBufferingResource handle.
)doc");
+REGISTER_OP("MultiDeviceIterator")
+ .Output("handle: resource")
+ .Attr("devices: list(string) >= 1")
+ .Attr("shared_name: string")
+ .Attr("container: string")
+ .Attr("output_types: list(type) >= 1")
+ .Attr("output_shapes: list(shape) >= 1")
+ .Doc(R"doc(
+Creates a MultiDeviceIterator resource.
+
+handle: Handle to the resource created.
+devices: A list of devices the iterator works across.
+shared_name: If non-empty, this resource will be shared under the given name
+ across multiple sessions.
+container: If non-empty, this resource is placed in the given container.
+ Otherwise, a default container is used.
+output_types: The type list for the return values.
+output_shapes: The list of shapes being produced.
+)doc");
+
+REGISTER_OP("MultiDeviceIteratorInit")
+ .Input("dataset: variant")
+ .Input("multi_device_iterator: resource")
+ .Output("incarnation_id: int64")
+ .Doc(R"doc(
+Initializes the multi-device iterator with the given dataset.
+
+incarnation_id: An int64 indicating which incarnation of the MultiDeviceIterator
+ is running.
+dataset: Dataset to be iterated upon.
+multi_device_iterator: A MultiDeviceIteratorResource.
+)doc");
+
+REGISTER_OP("MultiDeviceIteratorGetNextFromShard")
+ .Input("multi_device_iterator: resource")
+ .Input("shard_num: int32")
+ .Input("incarnation_id: int64")
+ .Output("components: output_types")
+ .Attr("output_types: list(type) >= 1")
+ .Attr("output_shapes: list(shape) >= 1")
+ .Doc(R"doc(
+Gets next element for the provided shard number.
+
+multi_device_iterator: A MultiDeviceIterator resource.
+shard_num: Integer representing which shard to fetch data for.
+incarnation_id: Which incarnation of the MultiDeviceIterator is running.
+components: Result of the get_next on the dataset.
+output_types: The type list for the return values.
+output_shapes: The list of shapes being produced.
+)doc");
+
+REGISTER_OP("MultiDeviceIteratorToStringHandle")
+ .Input("multi_device_iterator: resource")
+ .Output("string_handle: string")
+ .Doc(R"doc(
+Produces a string handle for the given MultiDeviceIterator.
+
+multi_device_iterator: A MultiDeviceIterator resource.
+string_handle: A string representing the resource.
+)doc");
+
+REGISTER_OP("MultiDeviceIteratorFromStringHandle")
+ .Input("string_handle: string")
+ .Output("multi_device_iterator: resource")
+ .Attr("output_types: list(type) >= 0 = []")
+ .Attr("output_shapes: list(shape) >= 0 = []")
+ .Doc(R"doc(
+Generates a MultiDeviceIterator resource from its provided string handle.
+
+string_handle: String representing the resource.
+multi_device_iterator: A MultiDeviceIterator resource.
+output_types: The type list for the return values.
+output_shapes: The list of shapes being produced.
+)doc");
+
REGISTER_OP("ThreadPoolDataset")
.Input("input_dataset: variant")
.Input("thread_pool: resource")
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 036dc795bb..2de1a79d28 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -192,6 +192,7 @@ py_test(
deps = [
"//tensorflow/contrib/data/python/ops:batching",
"//tensorflow/contrib/data/python/ops:error_ops",
+ "//tensorflow/contrib/data/python/ops:optimization",
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
"//tensorflow/python:errors",
@@ -213,6 +214,7 @@ py_test(
"//tensorflow/python:client_testlib",
"//tensorflow/python:errors",
"//tensorflow/python/data/ops:dataset_ops",
+ "@absl_py//absl/testing:parameterized",
],
)
@@ -234,7 +236,12 @@ cuda_py_test(
"//tensorflow/python/data/ops:dataset_ops",
"//tensorflow/python/data/ops:iterator_ops",
],
- tags = ["no_windows_gpu"],
+ tags = [
+ "manual",
+ "no_oss",
+ "no_windows_gpu" +
+ "notap",
+ ],
)
py_test(
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index b7025f3802..48adc98e9a 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -26,6 +26,7 @@ import numpy as np
from tensorflow.contrib.data.python.ops import batching
from tensorflow.contrib.data.python.ops import error_ops
+from tensorflow.contrib.data.python.ops import optimization
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
from tensorflow.python.data.ops import dataset_ops
@@ -82,15 +83,17 @@ class MapDatasetTest(test.TestCase):
def write_string_to_file(value, filename):
with open(filename, "w") as f:
f.write(value)
- filenames = [os.path.join(self.get_temp_dir(), "file_%d.txt" % i)
- for i in range(5)]
+
+ filenames = [
+ os.path.join(self.get_temp_dir(), "file_%d.txt" % i) for i in range(5)
+ ]
for filename in filenames:
write_string_to_file(filename, filename)
dataset = (
dataset_ops.Dataset.from_tensor_slices(filenames).map(
- io_ops.read_file, num_parallel_calls=2).prefetch(2).apply(
- error_ops.ignore_errors()))
+ io_ops.read_file,
+ num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
iterator = dataset.make_initializable_iterator()
init_op = iterator.initializer
get_next = iterator.get_next()
@@ -264,5 +267,46 @@ class MapDatasetBenchmark(test.Benchmark):
benchmark("Transformation parallelism evaluation", par_num_calls_series)
benchmark("Threadpool size evaluation", par_inter_op_series)
+ # This benchmark compares the performance of a pipeline containing multiple
+ # chained maps, with and without map fusion.
+ def benchmarkChainOfMaps(self):
+ chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+ for chain_length in chain_lengths:
+ self._benchmarkChainOfMaps(chain_length, False)
+ self._benchmarkChainOfMaps(chain_length, True)
+
+ def _benchmarkChainOfMaps(self, chain_length, optimize_dataset):
+ with ops.Graph().as_default():
+ dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+ for _ in range(chain_length):
+ dataset = dataset.map(lambda x: x)
+ if optimize_dataset:
+ dataset = dataset.apply(optimization.optimize(["map_fusion"]))
+
+ iterator = dataset.make_one_shot_iterator()
+ next_element = iterator.get_next()
+
+ with session.Session() as sess:
+ for _ in range(5):
+ sess.run(next_element.op)
+ deltas = []
+ for _ in range(100):
+ start = time.time()
+ for _ in range(100):
+ sess.run(next_element.op)
+ end = time.time()
+ deltas.append(end - start)
+
+ median_wall_time = np.median(deltas) / 100
+ opt_mark = "opt" if optimize_dataset else "no-opt"
+ print("Map dataset {} chain length: {} Median wall time: {}".format(
+ opt_mark, chain_length, median_wall_time))
+ self.report_benchmark(
+ iters=1000,
+ wall_time=median_wall_time,
+ name="benchmark_map_dataset_chain_latency_{}_{}".format(
+ opt_mark, chain_length))
+
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
index cfef40e192..d8156dc9c7 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
@@ -17,13 +17,15 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from absl.testing import parameterized
+
from tensorflow.contrib.data.python.ops import optimization
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import errors
from tensorflow.python.platform import test
-class OptimizeDatasetTest(test.TestCase):
+class OptimizeDatasetTest(test.TestCase, parameterized.TestCase):
def testAssertSuffix(self):
dataset = dataset_ops.Dataset.from_tensors(0).apply(
@@ -110,6 +112,53 @@ class OptimizeDatasetTest(test.TestCase):
"Function .* is not defined."):
sess.run(get_next)
+ @staticmethod
+ def map_functions():
+ identity = lambda x: x
+ increment = lambda x: x + 1
+
+ def increment_and_square(x):
+ y = x + 1
+ return y * y
+
+ functions = [identity, increment, increment_and_square]
+ tests = []
+
+ for fun1 in functions:
+ for fun2 in functions:
+ tests.append(([fun1, fun2],))
+ for fun3 in functions:
+ tests.append(([fun1, fun2, fun3],))
+
+ swap = lambda x, n: (n, x)
+ tests.append(([lambda x: (x, 42), swap],))
+ tests.append(([lambda x: (x, 42), swap, swap],))
+ return tuple(tests)
+
+ @parameterized.parameters(*map_functions.__func__())
+ def testMapFusion(self, functions):
+ dataset = dataset_ops.Dataset.range(5).apply(
+ optimization.assert_next(["Map", "Prefetch"]))
+ for function in functions:
+ dataset = dataset.map(function)
+
+ dataset = dataset.prefetch(0).apply(optimization.optimize(["map_fusion"]))
+ iterator = dataset.make_one_shot_iterator()
+ get_next = iterator.get_next()
+ with self.test_session() as sess:
+ for x in range(5):
+ result = sess.run(get_next)
+ r = x
+ for function in functions:
+ if isinstance(r, tuple):
+ r = function(*r) # Pass tuple as multiple arguments.
+ else:
+ r = function(r)
+ self.assertAllEqual(r, result)
+
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(get_next)
+
if __name__ == "__main__":
test.main()
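
The `map_functions.__func__()` call above looks unusual: inside the class body
the name `map_functions` is still bound to a staticmethod descriptor, not a
plain function, so it must be unwrapped via `__func__` before the decorator can
invoke it. A minimal illustration of the same pattern (hypothetical test, not
part of this change):

    from absl.testing import parameterized

    from tensorflow.python.platform import test

    class SquareTest(test.TestCase, parameterized.TestCase):

      @staticmethod
      def cases():
        return ((2, 4), (3, 9))

      # cases is a staticmethod object here; __func__ yields the callable.
      @parameterized.parameters(*cases.__func__())
      def testSquare(self, x, expected):
        self.assertEqual(x * x, expected)

    if __name__ == "__main__":
      test.main()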
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index 82543b1039..2da6131e8e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import function
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.platform import test
@@ -907,5 +908,155 @@ class CopyToDeviceTest(test.TestCase):
sess.run(next_element)
+class MultiDeviceIteratorTest(test.TestCase):
+
+ def testBasic(self):
+ dataset = dataset_ops.Dataset.range(10)
+ multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+ dataset, ["/cpu:1", "/cpu:2"])
+ elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+ config = config_pb2.ConfigProto(device_count={"CPU": 3})
+ with self.test_session(config=config) as sess:
+ sess.run(multi_device_iterator.initializer)
+ for i in range(0, 10, 2):
+ self.assertEqual(i, sess.run(elem_on_1))
+ self.assertEqual(i + 1, sess.run(elem_on_2))
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(elem_on_1)
+ sess.run(elem_on_2)
+
+ def testOneOnSameDevice(self):
+ with ops.device("/cpu:0"):
+ dataset = dataset_ops.Dataset.range(10)
+ multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+ dataset, ["/cpu:0", "/cpu:1"])
+ elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+ config = config_pb2.ConfigProto(device_count={"CPU": 2})
+ with self.test_session(config=config) as sess:
+ sess.run(multi_device_iterator.initializer)
+ for i in range(0, 10, 2):
+ self.assertEqual(i, sess.run(elem_on_1))
+ self.assertEqual(i + 1, sess.run(elem_on_2))
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(elem_on_1)
+ sess.run(elem_on_2)
+
+ def testRepeatDevices(self):
+ with ops.device("/cpu:0"):
+ dataset = dataset_ops.Dataset.range(20)
+ multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+ dataset, ["/cpu:1", "/cpu:2", "/cpu:1", "/cpu:2"])
+ elements = multi_device_iterator.get_next()
+ elem_on_1, elem_on_2, elem_on_3, elem_on_4 = elements
+
+ config = config_pb2.ConfigProto(device_count={"CPU": 3})
+ with self.test_session(config=config) as sess:
+ sess.run(multi_device_iterator.initializer)
+ for i in range(0, 20, 4):
+ self.assertEqual(i, sess.run(elem_on_1))
+ self.assertEqual(i + 1, sess.run(elem_on_2))
+ self.assertEqual(i + 2, sess.run(elem_on_3))
+ self.assertEqual(i + 3, sess.run(elem_on_4))
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(elem_on_1)
+ sess.run(elem_on_2)
+ sess.run(elem_on_3)
+ sess.run(elem_on_4)
+
+ def testNotFullyDivisible(self):
+ dataset = dataset_ops.Dataset.range(9)
+ multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+ dataset, ["/cpu:1", "/cpu:2"])
+ elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+ config = config_pb2.ConfigProto(device_count={"CPU": 3})
+ with self.test_session(config=config) as sess:
+ sess.run(multi_device_iterator.initializer)
+ for i in range(0, 8, 2):
+ self.assertEqual(i, sess.run(elem_on_1))
+ self.assertEqual(i + 1, sess.run(elem_on_2))
+ self.assertEqual(8, sess.run(elem_on_1))
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(elem_on_1)
+ sess.run(elem_on_2)
+
+ def testUneven(self):
+ dataset = dataset_ops.Dataset.range(10)
+ multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+ dataset, ["/cpu:1", "/cpu:2"])
+ elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+ config = config_pb2.ConfigProto(device_count={"CPU": 3})
+ with self.test_session(config=config) as sess:
+ sess.run(multi_device_iterator.initializer)
+ for i in range(0, 10, 2):
+ self.assertEqual(i, sess.run(elem_on_1))
+ for i in range(0, 10, 2):
+ self.assertEqual(i + 1, sess.run(elem_on_2))
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(elem_on_1)
+ sess.run(elem_on_2)
+
+ def testMultipleInitializations(self):
+ with ops.device("/cpu:0"):
+ epoch = array_ops.placeholder(dtypes.int64, shape=[])
+ dataset1 = dataset_ops.Dataset.from_tensors(epoch).repeat(1000)
+ dataset2 = dataset_ops.Dataset.range(1000)
+ dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+ multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+ dataset, ["/cpu:1", "/cpu:2"], prefetch_buffer_size=4)
+ elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+ init_op = multi_device_iterator.initializer
+
+ config = config_pb2.ConfigProto(device_count={"CPU": 3})
+ with self.test_session(config=config) as sess:
+ for i in range(1000):
+ sess.run(init_op, feed_dict={epoch: i})
+ self.assertEqual([(i, 0), (i, 1)], sess.run([elem_on_1, elem_on_2]))
+
+ def testBasicGpu(self):
+ if not test_util.is_gpu_available():
+ self.skipTest("No GPU available")
+
+ with compat.forward_compatibility_horizon(2018, 8, 4):
+ dataset = dataset_ops.Dataset.range(10)
+ multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+ dataset, ["/cpu:1", "/gpu:0"])
+ elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+ config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
+ with self.test_session(config=config) as sess:
+ sess.run(multi_device_iterator.initializer)
+ for i in range(0, 10, 2):
+ self.assertEqual(i, sess.run(elem_on_1))
+ self.assertEqual(i + 1, sess.run(elem_on_2))
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(elem_on_1)
+ sess.run(elem_on_2)
+
+ def testUnevenGpu(self):
+ if not test_util.is_gpu_available():
+ self.skipTest("No GPU available")
+
+ with compat.forward_compatibility_horizon(2018, 8, 4):
+ dataset = dataset_ops.Dataset.range(10)
+ multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+ dataset, ["/cpu:1", "/gpu:0"])
+ elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+ config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
+ with self.test_session(config=config) as sess:
+ sess.run(multi_device_iterator.initializer)
+ for i in range(0, 10, 2):
+ self.assertEqual(i, sess.run(elem_on_1))
+ for i in range(0, 10, 2):
+ self.assertEqual(i + 1, sess.run(elem_on_2))
+ with self.assertRaises(errors.OutOfRangeError):
+ sess.run(elem_on_1)
+ sess.run(elem_on_2)
+
+
if __name__ == "__main__":
test.main()
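
The tests above pin down the MultiDeviceIterator contract: elements are handed
out round-robin, so with two devices element 2*i goes to the first device and
2*i + 1 to the second. A minimal usage sketch mirroring testBasic (the device
names and three-CPU config are illustrative):

    from tensorflow.contrib.data.python.ops import prefetching_ops
    from tensorflow.core.protobuf import config_pb2
    from tensorflow.python.client import session
    from tensorflow.python.data.ops import dataset_ops

    dataset = dataset_ops.Dataset.range(10)
    multi_device_iterator = prefetching_ops.MultiDeviceIterator(
        dataset, ["/cpu:1", "/cpu:2"])
    elem_on_1, elem_on_2 = multi_device_iterator.get_next()

    config = config_pb2.ConfigProto(device_count={"CPU": 3})
    with session.Session(config=config) as sess:
      sess.run(multi_device_iterator.initializer)
      print(sess.run([elem_on_1, elem_on_2]))  # [0, 1]
      print(sess.run([elem_on_1, elem_on_2]))  # [2, 3]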
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index a4914f4cde..42fc20ec01 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -515,10 +515,7 @@ def batch_and_drop_remainder(batch_size):
def _apply_fn(dataset):
"""Function from `Dataset` to `Dataset` that applies the transformation."""
- # TODO(jsimsa): Switch to using `batch(..., drop_remainder=True)` any time
- # after 6/30/2018.
- batched = dataset.batch(batch_size)
- return _filter_irregular_batches(batch_size)(batched)
+ return dataset.batch(batch_size, drop_remainder=True)
return _apply_fn
@@ -553,11 +550,9 @@ def padded_batch_and_drop_remainder(batch_size,
def _apply_fn(dataset):
"""Function from `Dataset` to `Dataset` that applies the transformation."""
- # TODO(jsimsa): Switch to using `padded_batch(..., drop_remainder=True)`
- # any time after 6/30/2018.
- batched = dataset.padded_batch(
- batch_size, padded_shapes=padded_shapes, padding_values=padding_values)
- return _filter_irregular_batches(batch_size)(batched)
+ return dataset.padded_batch(
+ batch_size, padded_shapes=padded_shapes, padding_values=padding_values,
+ drop_remainder=True)
return _apply_fn
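
Besides removing the filter-based workaround, delegating to the core
`drop_remainder=True` argument gives the batch dimension a static size, which
the old `_filter_irregular_batches` path could not. A quick sketch of the
shape difference:

    from tensorflow.python.data.ops import dataset_ops

    ds = dataset_ops.Dataset.range(10)
    print(ds.batch(3).output_shapes)                       # (?,): last batch may be short
    print(ds.batch(3, drop_remainder=True).output_shapes)  # (3,): static batch size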
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 45abd6376c..0edd7c9fe9 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.framework import function
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import functional_ops
from tensorflow.python.ops import gen_dataset_ops as core_gen_dataset_ops
from tensorflow.python.ops import resource_variable_ops
@@ -523,3 +524,174 @@ class _CopyToDeviceDataset(dataset_ops.Dataset):
@property
def output_classes(self):
return self._input_dataset.output_classes
+
+
+class _PerDeviceGenerator(dataset_ops.Dataset):
+ """A `dummy` generator dataset."""
+
+ def __init__(self, shard_num, multi_device_iterator_resource, incarnation_id,
+ source_device, target_device, output_shapes, output_types,
+ output_classes):
+ self._target_device = target_device
+ self._output_types = output_types
+ self._output_shapes = output_shapes
+ self._output_classes = output_classes
+ self._flat_output_shapes = nest.flatten(
+ sparse.as_dense_shapes(self._output_shapes, self._output_classes))
+ self._flat_output_types = nest.flatten(
+ sparse.as_dense_types(self._output_types, self._output_classes))
+
+ multi_device_iterator_string_handle = (
+ gen_dataset_ops.multi_device_iterator_to_string_handle(
+ multi_device_iterator_resource))
+
+ @function.Defun()
+ def _init_func():
+ return multi_device_iterator_string_handle
+
+ @function.Defun()
+ def _remote_init_func():
+ return functional_ops.remote_call(
+ target=source_device,
+ args=_init_func.captured_inputs,
+ Tout=[dtypes.string],
+ f=_init_func)
+
+ self._init_func = _remote_init_func
+ self._init_captured_args = _remote_init_func.captured_inputs
+
+ @function.Defun(dtypes.string)
+ def _next_func(string_handle):
+ multi_device_iterator = (
+ gen_dataset_ops.multi_device_iterator_from_string_handle(
+ string_handle=string_handle,
+ output_types=self._flat_output_types,
+ output_shapes=self._flat_output_shapes))
+ return gen_dataset_ops.multi_device_iterator_get_next_from_shard(
+ multi_device_iterator=multi_device_iterator,
+ shard_num=shard_num,
+ incarnation_id=incarnation_id,
+ output_types=self._flat_output_types,
+ output_shapes=self._flat_output_shapes)
+
+ @function.Defun(dtypes.string)
+ def _remote_next_func(string_handle):
+ return functional_ops.remote_call(
+ target=source_device,
+ args=[string_handle] + _next_func.captured_inputs,
+ Tout=self._flat_output_types,
+ f=_next_func)
+
+ self._next_func = _remote_next_func
+ self._next_captured_args = _remote_next_func.captured_inputs
+
+ @function.Defun(dtypes.string)
+ def _finalize_func(unused_string_handle):
+ return array_ops.constant(0, dtypes.int64)
+
+ @function.Defun(dtypes.string)
+ def _remote_finalize_func(string_handle):
+ return functional_ops.remote_call(
+ target=source_device,
+ args=[string_handle] + _finalize_func.captured_inputs,
+ Tout=[dtypes.int64],
+ f=_finalize_func)
+
+ self._finalize_func = _remote_finalize_func
+ self._finalize_captured_args = _remote_finalize_func.captured_inputs
+
+ def _as_variant_tensor(self):
+ with ops.device(self._target_device):
+ return core_gen_dataset_ops.generator_dataset(
+ self._init_captured_args,
+ self._next_captured_args,
+ self._finalize_captured_args,
+ init_func=self._init_func,
+ next_func=self._next_func,
+ finalize_func=self._finalize_func,
+ output_types=self._flat_output_types,
+ output_shapes=self._flat_output_shapes)
+
+ @property
+ def output_types(self):
+ return self._output_types
+
+ @property
+ def output_shapes(self):
+ return self._output_shapes
+
+ @property
+ def output_classes(self):
+ return self._output_classes
+
+
+class MultiDeviceIterator(object):
+ """An iterator over multiple devices."""
+
+ def __init__(self,
+ dataset,
+ devices,
+ prefetch_buffer_size=1,
+ source_device="/cpu:0"):
+ self._dataset = dataset
+ self._devices = devices
+ self._source_device = source_device
+ self._source_device_tensor = ops.convert_to_tensor(source_device)
+
+ self._flat_output_shapes = nest.flatten(
+ sparse.as_dense_shapes(self._dataset.output_shapes,
+ self._dataset.output_classes))
+ self._flat_output_types = nest.flatten(
+ sparse.as_dense_types(self._dataset.output_types,
+ self._dataset.output_classes))
+
+ # Create the MultiDeviceIterator.
+ with ops.device(self._source_device):
+ self._multi_device_iterator_resource = (
+ gen_dataset_ops.multi_device_iterator(
+ devices=self._devices,
+ shared_name="",
+ container="",
+ output_types=self._flat_output_types,
+ output_shapes=self._flat_output_shapes))
+
+ # The incarnation ID is used to ensure consistency between the per-device
+ # iterators and the multi-device iterator.
+ self._incarnation_id = gen_dataset_ops.multi_device_iterator_init(
+ self._dataset._as_variant_tensor(), # pylint: disable=protected-access
+ self._multi_device_iterator_resource)
+
+    # TODO(rohanj): Explore the possibility of having the MultiDeviceIterator
+    # initialize the device side of the pipeline. This would allow the
+    # MultiDeviceIterator to choose, for example, to move some transformations
+    # from its input onto the device side, which might be useful for rewrites.
+ # Create the per device iterators.
+    self._device_iterators = []
+    for i, device in enumerate(self._devices):
+      ds = _PerDeviceGenerator(
+          i, self._multi_device_iterator_resource, self._incarnation_id,
+          self._source_device_tensor, device, self._dataset.output_shapes,
+          self._dataset.output_types, self._dataset.output_classes)
+      ds = ds.prefetch(prefetch_buffer_size)
+      with ops.device(device):
+        self._device_iterators.append(ds.make_initializable_iterator())
+
+ device_iterator_initializers = [
+ iterator.initializer for iterator in self._device_iterators
+ ]
+ self._initializer = control_flow_ops.group(*device_iterator_initializers)
+
+  def get_next(self):
+    result = []
+    for i, device in enumerate(self._devices):
+      with ops.device(device):
+        result.append(self._device_iterators[i].get_next())
+    return result
+
+ @property
+ def initializer(self):
+ return self._initializer
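
Because `initializer` groups the per-device iterator initializers, a single
MultiDeviceIterator pipeline can be re-initialized between epochs, which is
what testMultipleInitializations above exercises. A condensed sketch (device
names and epoch count are illustrative):

    from tensorflow.contrib.data.python.ops import prefetching_ops
    from tensorflow.core.protobuf import config_pb2
    from tensorflow.python.client import session
    from tensorflow.python.data.ops import dataset_ops
    from tensorflow.python.framework import dtypes
    from tensorflow.python.ops import array_ops

    epoch = array_ops.placeholder(dtypes.int64, shape=[])
    dataset = dataset_ops.Dataset.zip(
        (dataset_ops.Dataset.from_tensors(epoch).repeat(1000),
         dataset_ops.Dataset.range(1000)))
    mdi = prefetching_ops.MultiDeviceIterator(
        dataset, ["/cpu:1", "/cpu:2"], prefetch_buffer_size=4)
    elem_on_1, elem_on_2 = mdi.get_next()

    config = config_pb2.ConfigProto(device_count={"CPU": 3})
    with session.Session(config=config) as sess:
      for i in range(3):  # one initializer run per "epoch"
        sess.run(mdi.initializer, feed_dict={epoch: i})
        print(sess.run([elem_on_1, elem_on_2]))  # [(i, 0), (i, 1)]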
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index f5d7e24ae2..cbe741de5a 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -101,6 +101,23 @@ py_library(
)
py_library(
+ name = "parameter_server_strategy",
+ srcs = ["parameter_server_strategy.py"],
+ visibility = ["//tensorflow:internal"],
+ deps = [
+ ":cross_tower_ops",
+ ":mirrored_strategy",
+ ":values",
+ "//tensorflow/core:protos_all_py",
+ "//tensorflow/python:array_ops",
+ "//tensorflow/python:framework_ops",
+ "//tensorflow/python:resource_variable_ops",
+ "//tensorflow/python:training",
+ "//tensorflow/python:util",
+ ],
+)
+
+py_library(
name = "one_device_strategy",
srcs = ["one_device_strategy.py"],
visibility = ["//tensorflow:internal"],
@@ -207,6 +224,35 @@ py_test(
],
)
+py_test(
+ name = "parameter_server_strategy_test",
+ srcs = ["parameter_server_strategy_test.py"],
+ srcs_version = "PY2AND3",
+ tags = [
+ "no_pip",
+ ],
+ deps = [
+ ":combinations",
+ ":multi_worker_test_base",
+ ":parameter_server_strategy",
+ "//tensorflow/core:protos_all_py",
+ "//tensorflow/python:array_ops",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:constant_op",
+ "//tensorflow/python:control_flow_ops",
+ "//tensorflow/python:framework_ops",
+ "//tensorflow/python:gradients",
+ "//tensorflow/python:layers",
+ "//tensorflow/python:session",
+ "//tensorflow/python:training",
+ "//tensorflow/python:variable_scope",
+ "//tensorflow/python:variables",
+ "//tensorflow/python/eager:context",
+ "//tensorflow/python/estimator:run_config",
+ "@absl_py//absl/testing:parameterized",
+ ],
+)
+
cuda_py_test(
name = "mirrored_strategy_multigpu_test",
srcs = ["mirrored_strategy_multigpu_test.py"],
diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
index fe3df9cbb9..bcb977f640 100644
--- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
+++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
@@ -49,17 +49,23 @@ class CheckpointUtilsWithDistributionStrategyTest(
def testInitFromCheckpoint(self, distribution, in_tower_mode):
checkpoint_dir = self.get_temp_dir()
with self.test_session() as session:
- v1_value, _, _, _ = checkpoint_utils_test._create_checkpoints(
+ v1_value, v2_value, _, _ = checkpoint_utils_test._create_checkpoints(
session, checkpoint_dir)
def init_and_verify(g):
v1 = variable_scope.get_variable("new_var1", [1, 10])
+ v2 = variable_scope.get_variable(
+ "new_var2", [10, 10],
+ synchronization=variable_scope.VariableSynchronization.ON_READ,
+ aggregation=variable_scope.VariableAggregation.MEAN)
checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
"var1": "new_var1",
+ "var2": "new_var2"
})
with self.test_session(graph=g) as session:
session.run(variables.global_variables_initializer())
self.assertAllEqual(v1_value, self.evaluate(v1))
+ self.assertAllEqual(v2_value, self.evaluate(v2))
with ops.Graph().as_default() as g, distribution.scope():
if in_tower_mode:
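
The test now also restores into an ON_READ (tower-local) variable. In
isolation, the `get_variable` arguments it relies on look like the sketch
below; `checkpoint_dir` is a hypothetical path to a checkpoint containing
"var1" of shape [1, 10] and "var2" of shape [10, 10], matching
_create_checkpoints above.

    from tensorflow.python.ops import variable_scope
    from tensorflow.python.training import checkpoint_utils

    checkpoint_dir = "/tmp/ckpt"  # hypothetical; must already exist
    v1 = variable_scope.get_variable("new_var1", [1, 10])
    v2 = variable_scope.get_variable(
        "new_var2", [10, 10],
        synchronization=variable_scope.VariableSynchronization.ON_READ,
        aggregation=variable_scope.VariableAggregation.MEAN)
    checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
        "var1": "new_var1",
        "var2": "new_var2",
    })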
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
index b0baf0dad1..b6037d2133 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -28,18 +28,37 @@ from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import device_util
+def check_destinations(destinations):
+ """Checks whether `destinations` is not None and not empty.
+
+ Args:
+ destinations: a DistributedValues, Variable, string or a list of strings.
+
+ Returns:
+ Boolean indicating whether `destinations` is not None and not empty.
+ """
+ # Calling bool() on a ResourceVariable is not allowed.
+ if isinstance(destinations, resource_variable_ops.ResourceVariable):
+ return bool(destinations.device)
+ return bool(destinations)
+
+
def validate_destinations(destinations):
- if not isinstance(destinations,
- (value_lib.DistributedValues, six.string_types, list)):
+ if not isinstance(
+ destinations,
+ (value_lib.DistributedValues, resource_variable_ops.ResourceVariable,
+ six.string_types, list)):
raise ValueError("destinations must be one of a `DistributedValues` object,"
- " a device string, a list of device strings or None")
+ " a tf.Variable object, a device string, a list of device "
+ "strings or None")
- if not destinations:
+ if not check_destinations(destinations):
raise ValueError("destinations can not be empty")
@@ -59,6 +78,8 @@ def _validate_value_destination_pairs(value_destination_pairs):
def get_devices_from(destinations):
if isinstance(destinations, value_lib.DistributedValues):
return list(destinations.devices)
+ elif isinstance(destinations, resource_variable_ops.ResourceVariable):
+ return [destinations.device]
elif isinstance(destinations, six.string_types):
return [device_util.resolve(destinations)]
else:
@@ -225,7 +246,10 @@ class ReductionToOneDeviceCrossTowerOps(CrossTowerOps):
super(ReductionToOneDeviceCrossTowerOps, self).__init__()
def _reduce(self, aggregation, per_device_value, destinations):
- devices = get_devices_from(destinations or per_device_value)
+ if check_destinations(destinations):
+ devices = get_devices_from(destinations)
+ else:
+ devices = get_devices_from(per_device_value)
reduce_to_device = self.reduce_to_device or devices[0]
reduced = _simple_reduce(per_device_value, reduce_to_device,
self.accumulation_fn, aggregation)
@@ -508,7 +532,10 @@ class AllReduceCrossTowerOps(CrossTowerOps):
logging.WARN,
"Efficient allreduce is not supported for IndexedSlices.", 10)
- devices = get_devices_from(destinations or per_device_value)
+ if check_destinations(destinations):
+ devices = get_devices_from(destinations)
+ else:
+ devices = get_devices_from(per_device_value)
reduce_to_device = devices[0]
reduced = _simple_reduce(per_device_value, reduce_to_device,
math_ops.add_n, aggregation)
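
The motivation for check_destinations is that the truthiness of a
ResourceVariable cannot be tested directly (bool() on it raises in graph
mode), so emptiness is decided from the variable's device string instead. A
small sketch of the resulting dispatch (device placement illustrative):

    from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import resource_variable_ops

    with ops.device("/cpu:0"):
      v = resource_variable_ops.ResourceVariable(1.0)

    print(cross_tower_ops_lib.check_destinations(v))     # True, via v.device
    print(cross_tower_ops_lib.check_destinations(None))  # False
    print(cross_tower_ops_lib.check_destinations([]))    # False
    cross_tower_ops_lib.validate_destinations(v)         # accepted after this change
    print(cross_tower_ops_lib.get_devices_from(v))       # [v.device]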
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index dcbc6b0878..eb2d102012 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -20,7 +20,6 @@ from __future__ import print_function
import contextlib
import threading
-import six
from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
from tensorflow.contrib.distribute.python import shared_variable_creator
@@ -60,6 +59,156 @@ class _RequestedStop(Exception):
pass
+# Make _call_for_each_tower and _reduce_non_distributed_value not members of
+# MirroredStrategy so that they are generally not allowed to use anything
+# specific to MirroredStrategy and thus can be shared with other distribution
+# strategies.
+
+
+# TODO(yuefengz): maybe create a common class for those who need to call this
+# _call_for_each_tower.
+def _call_for_each_tower(distribution, fn, *args, **kwargs):
+ """Run `fn` in separate threads, once per tower/worker device.
+
+ Args:
+ distribution: the DistributionStrategy object.
+ fn: function to run (will be run once per device, each in its own thread).
+ *args: positional arguments for `fn`
+ **kwargs: keyword arguments for `fn`.
+ `"run_concurrently"`: Boolean indicating whether executions of `fn`
+ can be run concurrently (under eager execution only), defaults to
+ `True`.
+
+ Returns:
+ Merged return value of `fn` across all towers.
+
+ Raises:
+    RuntimeError: If fn() calls get_tower_context().merge_call() a different
+        number of times on different devices.
+ """
+ run_concurrently = kwargs.pop("run_concurrently", True)
+ if not context.executing_eagerly():
+ # Lots of TF library code isn't thread-safe in graph mode, and
+ # there is little to be gained by turning on multithreading when
+ # constructing a graph.
+ run_concurrently = False
+ # Needed for per-thread device, etc. contexts in graph mode.
+ ops.get_default_graph().switch_to_thread_local()
+ elif run_concurrently is None:
+ run_concurrently = True
+
+ coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,))
+
+ shared_variable_store = {}
+
+ # TODO(isaprykin): Create these threads once instead of during every run()
+ # call.
+ threads = []
+ for index, d in enumerate(distribution.worker_devices):
+ variable_creator_fn = shared_variable_creator.make_fn(
+ shared_variable_store, index)
+ t = MirroredStrategy._MirroredTowerThread( # pylint: disable=protected-access
+ distribution, coord, d, variable_creator_fn, fn,
+ *values.select_device(d, args), **values.select_device(d, kwargs))
+ threads.append(t)
+
+ for t in threads:
+ t.start()
+
+  # When `fn` starts, the `should_run` event is set on the
+  # _MirroredTowerThread (`MTT`) threads. The execution waits until
+ # `MTT.has_paused` is set, which indicates that either `fn` is
+ # complete or a `get_tower_context().merge_call()` is called. If `fn` is
+ # complete, then `MTT.done` is set to True. Otherwise, arguments
+ # of `get_tower_context().merge_call` from all paused threads are grouped
+ # and the `merge_fn` is performed. Results of the
+ # `get_tower_context().merge_call` are then set to `MTT.merge_result`.
+ # Each such `get_tower_context().merge_call` call returns the
+ # `MTT.merge_result` for that thread when `MTT.should_run` event
+ # is reset again. Execution of `fn` resumes.
+
+ try:
+ with coord.stop_on_exception():
+ all_done = False
+ while not all_done and not coord.should_stop():
+ done = []
+ if run_concurrently:
+ for t in threads:
+ t.should_run.set()
+ for t in threads:
+ t.has_paused.wait()
+ t.has_paused.clear()
+ if coord.should_stop():
+ return None
+ done.append(t.done)
+ else:
+ for t in threads:
+ t.should_run.set()
+ t.has_paused.wait()
+ t.has_paused.clear()
+ if coord.should_stop():
+ return None
+ done.append(t.done)
+ if coord.should_stop():
+ return None
+ all_done = all(done)
+ if not all_done:
+ if any(done):
+ raise RuntimeError("Some towers made a different number of "
+ "tower_context().merge_call() calls.")
+ # get_tower_context().merge_call() case
+ merge_args = values.regroup({t.device: t.merge_args for t in threads})
+ merge_kwargs = values.regroup(
+ {t.device: t.merge_kwargs for t in threads})
+ # We capture the name_scope of the MTT when we call merge_fn
+ # to ensure that if we have opened a name scope in the MTT,
+ # it will be respected when executing the merge function. We only
+ # capture the name_scope from the first MTT and assume it is
+ # the same for all other MTTs.
+ mtt_captured_name_scope = threads[0].captured_name_scope
+ with ops.name_scope(mtt_captured_name_scope):
+ merge_result = threads[0].merge_fn(distribution, *merge_args,
+ **merge_kwargs)
+ for t in threads:
+ t.merge_result = values.select_device(t.device, merge_result)
+ finally:
+ for t in threads:
+ t.should_run.set()
+ coord.join(threads)
+
+ return values.regroup({t.device: t.main_result for t in threads})
+
+
+def _reduce_non_distributed_value(distribution, aggregation, value,
+ destinations):
+ """Reduce a non-DistributedValue `value` to `destinations`."""
+ if isinstance(value, values.DistributedValues):
+ raise ValueError("You are passing a `DistributedValue` to "
+ "`_reduce_non_distributed_value`, which is not allowed.")
+
+ if value == 0:
+ return 0
+ if aggregation == variable_scope.VariableAggregation.MEAN:
+ return distribution.broadcast(value, destinations)
+
+ cross_tower_ops_lib.validate_destinations(destinations)
+ if (len(distribution.worker_devices) != 1 or
+ not cross_tower_ops_lib.check_destinations(destinations)):
+ raise ValueError("A non-DistributedValues value cannot be reduced with the "
+ "given aggregation.")
+  # TODO(anjalisridhar): Move these methods to a device utility file?
+ devices = cross_tower_ops_lib.get_devices_from(destinations)
+ if len(devices) == 1:
+ with ops.device(devices[0]):
+ return array_ops.identity(value)
+ else:
+ value_updates = {}
+ for d in devices:
+ with ops.device(d):
+ value_updates[d] = array_ops.identity(value)
+ return values.Mirrored(value_updates)
+
+
class MirroredStrategy(distribute_lib.DistributionStrategy):
"""Mirrors vars to distribute across multiple devices on a single machine.
@@ -198,116 +347,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
self._devices)
def _call_for_each_tower(self, fn, *args, **kwargs):
- """Run `fn` in separate threads, once per tower/worker device.
-
- Args:
- fn: function to run (will be run once per device, each in its own thread).
- *args: positional arguments for `fn`
- **kwargs: keyword arguments for `fn`.
- `"run_concurrently"`: Boolean indicating whether executions of `fn`
- can be run concurrently (under eager execution only), defaults to
- `True`.
-
- Returns:
- Merged return value of `fn` across all towers.
-
- Raises:
- RuntimeError: If fn() calls get_tower_context().merge_call() a different
- number of times for when called for different devices.
- """
- run_concurrently = kwargs.pop("run_concurrently", True)
- if not context.executing_eagerly():
- # Lots of TF library code isn't thread-safe in graph mode, and
- # there is little to be gained by turning on multithreading when
- # constructing a graph.
- run_concurrently = False
- # Needed for per-thread device, etc. contexts in graph mode.
- ops.get_default_graph().switch_to_thread_local()
- elif run_concurrently is None:
- run_concurrently = True
-
- coord = coordinator.Coordinator(
- clean_stop_exception_types=(_RequestedStop,))
-
- shared_variable_store = {}
-
- # TODO(isaprykin): Create these threads once instead of during every run()
- # call.
- threads = []
- for index, d in enumerate(self._devices):
- variable_creator_fn = shared_variable_creator.make_fn(
- shared_variable_store, index)
- t = MirroredStrategy._MirroredTowerThread(
- self, coord, d, variable_creator_fn, fn,
- *values.select_device(d, args), **values.select_device(d, kwargs))
- threads.append(t)
-
- for t in threads:
- t.start()
-
- # When `fn` starts `should_run` event is set on _MirroredTowerThread
- # (`MTT`) threads. The execution waits until
- # `MTT.has_paused` is set, which indicates that either `fn` is
- # complete or a `get_tower_context().merge_call()` is called. If `fn` is
- # complete, then `MTT.done` is set to True. Otherwise, arguments
- # of `get_tower_context().merge_call` from all paused threads are grouped
- # and the `merge_fn` is performed. Results of the
- # `get_tower_context().merge_call` are then set to `MTT.merge_result`.
- # Each such `get_tower_context().merge_call` call returns the
- # `MTT.merge_result` for that thread when `MTT.should_run` event
- # is reset again. Execution of `fn` resumes.
-
- try:
- with coord.stop_on_exception():
- all_done = False
- while not all_done and not coord.should_stop():
- done = []
- if run_concurrently:
- for t in threads:
- t.should_run.set()
- for t in threads:
- t.has_paused.wait()
- t.has_paused.clear()
- if coord.should_stop():
- return None
- done.append(t.done)
- else:
- for t in threads:
- t.should_run.set()
- t.has_paused.wait()
- t.has_paused.clear()
- if coord.should_stop():
- return None
- done.append(t.done)
- if coord.should_stop():
- return None
- all_done = all(done)
- if not all_done:
- if any(done):
- raise RuntimeError("Some towers made a different number of "
- "tower_context().merge_call() calls.")
- # get_tower_context().merge_call() case
- merge_args = values.regroup(
- {t.device: t.merge_args for t in threads})
- merge_kwargs = values.regroup(
- {t.device: t.merge_kwargs for t in threads})
- # We capture the name_scope of the MTT when we call merge_fn
- # to ensure that if we have opened a name scope in the MTT,
- # it will be respected when executing the merge function. We only
- # capture the name_scope from the first MTT and assume it is
- # the same for all other MTTs.
- mtt_captured_name_scope = threads[0].captured_name_scope
- with ops.name_scope(mtt_captured_name_scope):
- merge_result = threads[0].merge_fn(
- self, *merge_args, **merge_kwargs)
- for t in threads:
- t.merge_result = values.select_device(t.device, merge_result)
- finally:
- for t in threads:
- t.should_run.set()
- coord.join(threads)
-
- return values.regroup({t.device: t.main_result for t in threads})
+ return _call_for_each_tower(self, fn, *args, **kwargs)
def map(self, map_over, fn, *args, **kwargs):
# TODO(josh11b): In eager mode, use one thread per device.
@@ -337,29 +377,9 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
def _reduce(self, aggregation, value, destinations):
assert not isinstance(value, values.Mirrored)
- if not isinstance(value, values.PerDevice):
- if value == 0:
- return 0
- if aggregation == variable_scope.VariableAggregation.MEAN:
- return self._broadcast(value, destinations)
-
- cross_tower_ops_lib.validate_destinations(destinations)
- if len(self._devices) == 1:
- if destinations:
- # TODO(anjalisridhar): Moves these methods to a device utility file?
- devices = cross_tower_ops_lib.get_devices_from(destinations)
- if len(devices) == 1:
- with ops.device(devices[0]):
- return array_ops.identity(value)
- else:
- value_updates = {}
- for d in devices:
- with ops.device(d):
- value_updates[d] = array_ops.identity(value)
- return values.Mirrored(value_updates)
- raise ValueError("A non PerDevice value cannot be reduced with the given "
- "aggregation.")
-
+ if not isinstance(value, values.DistributedValues):
+ return _reduce_non_distributed_value(self, aggregation, value,
+ destinations)
return self._get_cross_tower_ops().reduce(
aggregation, value, destinations=destinations)
@@ -433,15 +453,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
def _get_devices_from(self, colocate_with=None):
if colocate_with is None:
return self._devices
- elif isinstance(colocate_with, values.DistributedValues):
- # pylint: disable=protected-access
- return list(colocate_with._index.keys())
- elif isinstance(colocate_with, six.string_types):
- return [device_util.resolve(colocate_with)]
- elif isinstance(colocate_with, list):
- return [device_util.resolve(d) for d in colocate_with]
else:
- return colocate_with
+ return cross_tower_ops_lib.get_devices_from(colocate_with)
class _MirroredTowerThread(threading.Thread):
"""A thread that runs() a function on a device."""
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 6a14b833d2..aab7119901 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -792,8 +792,8 @@ class MirroredVariableUpdateTest(test.TestCase):
return mirrored_var.assign(5.0)
with self.assertRaisesRegexp(
- ValueError, "A non PerDevice value cannot be reduced with the given "
- "aggregation."):
+ ValueError, "A non-DistributedValues value cannot be reduced with "
+ "the given aggregation."):
self.evaluate(dist.unwrap(dist.call_for_each_tower(model_fn)))
@test_util.run_in_graph_and_eager_modes(config=config)
@@ -967,5 +967,74 @@ class MirroredAndTowerLocalVariableInitializerTest(test.TestCase):
self.evaluate(tower_local_var.initializer)
self.assertTrue(self.evaluate(tower_local_var.is_initialized()))
+
+class TowerLocalVariableAssignTest(test.TestCase):
+ config = config_pb2.ConfigProto()
+ config.allow_soft_placement = True
+
+ def _skip_eager_if_gpus_less_than(self, num_gpus):
+ if context.num_gpus() < num_gpus and context.executing_eagerly():
+ self.skipTest("Enough GPUs not available for this test in eager mode.")
+
+ @test_util.run_in_graph_and_eager_modes(config=config)
+ def testAssignTowerLocalVarSumAggregation(self):
+ self._skip_eager_if_gpus_less_than(1)
+ def model_fn():
+ v_sum = variable_scope.variable(
+ 1.0,
+ synchronization=variable_scope.VariableSynchronization.ON_READ,
+ aggregation=variable_scope.VariableAggregation.SUM)
+ return v_sum
+
+ dist = mirrored_strategy.MirroredStrategy(
+ ["/device:GPU:0", "/device:CPU:0"])
+
+ with dist.scope():
+ tower_local_var = dist.call_for_each_tower(model_fn,
+ run_concurrently=False)
+ self.assertTrue(isinstance(tower_local_var, values.TowerLocalVariable))
+ self.evaluate(variables.global_variables_initializer())
+ # Each tower has a value of 1.0 assigned to it in tower context.
+ # When we read the value using `read_var` we should see the SUM of each of
+ # values on each of the towers.
+ self.assertEqual(2.0, self.evaluate(dist.read_var(tower_local_var)))
+ # Assigning 6.0 in cross tower context will assign a value of
+ # 6.0/num_towers to each tower.
+ tlv_ops = tower_local_var.assign(6.0)
+ self.evaluate(tlv_ops)
+ # On reading the tower local var we should get the assigned value back.
+ # The value on all the towers are added before being returned by
+ # `read_var`.
+ self.assertEqual(6.0, self.evaluate(dist.read_var(tower_local_var)))
+
+ @test_util.run_in_graph_and_eager_modes(config=config)
+ def testAssignTowerLocalVarMeanAggregation(self):
+ self._skip_eager_if_gpus_less_than(1)
+ def model_fn():
+ v_sum = variable_scope.variable(
+ 1.0,
+ synchronization=variable_scope.VariableSynchronization.ON_READ,
+ aggregation=variable_scope.VariableAggregation.MEAN)
+ return v_sum
+
+ dist = mirrored_strategy.MirroredStrategy(
+ ["/device:GPU:0", "/device:CPU:0"])
+
+ with dist.scope():
+ tower_local_var = dist.call_for_each_tower(model_fn,
+ run_concurrently=False)
+ self.assertTrue(isinstance(tower_local_var, values.TowerLocalVariable))
+ self.evaluate(variables.global_variables_initializer())
+ # Each tower has a value of 1.0 assigned to it in tower context.
+ # When we read the value using `read_var` we should see the MEAN of values
+ # on all towers which is the value assigned in tower context.
+ self.assertEqual(1.0, self.evaluate(dist.read_var(tower_local_var)))
+ tlv_ops = tower_local_var.assign(6.0)
+ self.evaluate(tlv_ops)
+ # On reading the tower local var we should get the MEAN of all values
+ # which is equal to the value assigned.
+ self.assertEqual(6.0, self.evaluate(dist.read_var(tower_local_var)))
+
+
if __name__ == "__main__":
test.main()
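
The arithmetic behind the SUM-aggregation test above, in plain numbers: a
cross-tower assign of 6.0 to a SUM-aggregated tower-local variable stores
6.0 / num_towers on each tower, and read_var adds the components back up.

    num_towers = 2
    assigned = 6.0
    per_tower = assigned / num_towers   # 3.0 stored on each tower
    read_back = per_tower * num_towers  # 6.0 returned by dist.read_var
    assert read_back == assigned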
diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
index f659be5f42..fa479918bd 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
@@ -28,23 +28,39 @@ from tensorflow.python.eager import test
from tensorflow.python.framework import test_util
+def create_in_process_cluster(num_workers, num_ps):
+ """Create an in-process cluster that consists of only standard server."""
+ # Leave some memory for cuda runtime.
+ gpu_mem_frac = 0.7 / num_workers
+ worker_config = config_pb2.ConfigProto()
+ worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
+
+ ps_config = config_pb2.ConfigProto()
+ ps_config.device_count['GPU'] = 0
+
+  # Create in-process servers. Once an in-process tensorflow server is created,
+  # there is no way to terminate it, so we create one cluster per test process.
+  # We could have started the servers in separate processes and killed those
+  # processes to terminate the servers, but we avoid multiple processes because
+  # 1) it is more difficult to manage these processes, and
+  # 2) CUDA has global state such that if it is initialized in the parent
+  # process, the child process cannot initialize it again and thus cannot use
+  # GPUs (https://stackoverflow.com/questions/22950047).
+ return test_util.create_local_cluster(
+ num_workers,
+ num_ps=num_ps,
+ worker_config=worker_config,
+ ps_config=ps_config)
+
+
class MultiWorkerTestBase(test.TestCase):
"""Base class for testing multi node strategy and dataset."""
@classmethod
def setUpClass(cls):
"""Create a local cluster with 2 workers."""
- num_workers = 2
- # Leave some memory for cuda runtime.
- gpu_mem_frac = 0.7 / num_workers
- default_config = config_pb2.ConfigProto()
- default_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
-
- # The local cluster takes some portion of the local GPUs and there is no way
- # for the cluster to terminate unless using multiple processes. Therefore,
- # we have to only create only one cluster throughout a test process.
- workers, _ = test_util.create_local_cluster(
- num_workers, num_ps=0, worker_config=default_config)
+ workers, _ = create_in_process_cluster(num_workers=2, num_ps=0)
cls._master_target = workers[0].target
@contextlib.contextmanager
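
A usage sketch of the new helper outside a test class: build a two-worker,
one-ps in-process cluster and point a session at the first worker (the servers
live for the remainder of the process, as the comment above notes).

    from tensorflow.contrib.distribute.python import multi_worker_test_base
    from tensorflow.python.client import session

    workers, ps_servers = multi_worker_test_base.create_in_process_cluster(
        num_workers=2, num_ps=1)
    with session.Session(target=workers[0].target) as sess:
      pass  # graphs built here run against the in-process cluster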
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
new file mode 100644
index 0000000000..d53582279e
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -0,0 +1,353 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes implementing a multi-worker ps DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+
+from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import values
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import device_setter
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import nest
+
+_LOCAL_CPU = "/device:CPU:0"
+_LOCAL_GPU_0 = "/device:GPU:0"
+
+
+def _normalize_cluster_spec(cluster_spec):
+ if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
+ return server_lib.ClusterSpec(cluster_spec)
+  elif not isinstance(cluster_spec, server_lib.ClusterSpec):
+    raise ValueError(
+        "`cluster_spec` should be a dict, a `tf.train.ClusterSpec` or a "
+        "`tf.train.ClusterDef` object")
+  return cluster_spec
+
+
+# TODO(yuefengz): maybe cache variables on local CPU.
+# TODO(yuefengz): we may want to set session options to disallow communication
+# between workers.
+class ParameterServerStrategy(distribute_lib.DistributionStrategy):
+ """A parameter server DistributionStrategy.
+
+  This strategy class works for both local training and between-graph replicated
+  training for multiple workers. If `cluster_spec` is specified, either passed
+  in to the __init__() method or parsed from the
+  ["TF_CONFIG" environment
+  variable](https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig),
+  variables and updates to those variables are assigned to parameter servers and
+  other operations are assigned to workers. If `cluster_spec` is not set,
+  training is local and variables are assigned to the local CPU or the only
+  GPU. When each worker has more than one GPU, operations are replicated on
+  these GPUs. In both cases, operations are replicated but variables are not,
+  and the workers share a common view of which parameter server each variable
+  is assigned to.
+
+ This class assumes between-graph replication will be used and works on a graph
+ for a particular worker.
+
+  Callers are expected to use `call_for_each_tower(fn, *args, **kwargs)` for
+  any operations which can potentially be replicated across towers (i.e.
+  multiple GPUs), even if there is only a CPU or a single GPU. When defining
+  `fn`, extra caution needs to be taken:
+
+  1) Always use @{tf.get_variable} instead of @{tf.Variable}, which cannot
+  refer to the same variable on different towers.
+
+  2) It is generally not recommended to open a device scope under the
+  strategy's scope. A device scope (i.e. calling @{tf.device}) will be merged
+  with or override the device for operations but will not change the device
+  for variables.
+
+  3) It is also not recommended to open a colocation scope (i.e. calling
+  @{tf.colocate_with}) under the strategy's scope. For colocating variables,
+  use `distribution.colocate_vars_with` instead. Colocating ops may create
+  conflicts in device assignment.
+ """
+
+ def __init__(self,
+ num_gpus_per_worker=0,
+ cluster_spec=None,
+ task_type=None,
+ task_id=None):
+ """Initiailizes this strategy.
+
+ Args:
+ num_gpus_per_worker: number of local GPUs or GPUs per worker.
+ cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+ cluster configurations.
+ task_type: the current task type.
+ task_id: the current task id.
+ """
+ super(ParameterServerStrategy, self).__init__()
+ self._num_gpus_per_worker = num_gpus_per_worker
+ if cluster_spec:
+ cluster_spec = _normalize_cluster_spec(cluster_spec)
+ self._cluster_spec = cluster_spec
+
+ # We typically don't need to do all-reduce in this strategy.
+ self._cross_tower_ops = (
+ cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+ reduce_to_device=_LOCAL_CPU))
+
+ self._initialize_devices(num_gpus_per_worker, cluster_spec, task_type,
+ task_id)
+
+ def _initialize_devices(self, num_gpus_per_worker, cluster_spec, task_type,
+ task_id):
+ """Initialize internal devices.
+
+ It creates variable devices and compute devices. Variables and operations
+ will be assigned to them respectively. We have one compute device per tower.
+ The variable device is a device function or device string. The default
+ variable device assigns variables to parameter servers in a round-robin
+ fashion.
+
+ Args:
+ num_gpus_per_worker: number of local GPUs or GPUs per worker.
+ cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+ cluster configurations.
+ task_type: the current task type.
+ task_id: the current task id.
+
+ Raises:
+ ValueError: if the cluster_spec doesn't have ps jobs.
+ """
+ self._task_type = task_type or "worker"
+ self._task_id = task_id or 0
+ self._worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)
+
+ # TODO(yuefengz): maybe clearer to split it into two classes, one for
+    # the distributed case and one for the local case, once we have the factory
+ # class/method.
+
+    # Define compute devices, a list of device strings with one entry per
+    # tower. When there are GPUs, replicate operations on these GPUs.
+    # Otherwise, place operations on the CPU.
+ if cluster_spec is None:
+ # Local mode.
+ if num_gpus_per_worker > 0:
+ self._compute_devices = list(
+ map("/device:GPU:{}".format, range(num_gpus_per_worker)))
+ else:
+ self._compute_devices = [_LOCAL_CPU]
+ else:
+ # Distributed mode.
+ if num_gpus_per_worker > 0:
+ self._compute_devices = [
+ "%s/device:GPU:%d" % (self._worker_device, i)
+ for i in range(num_gpus_per_worker)
+ ]
+ else:
+ self._compute_devices = [self._worker_device]
+
+ self._compute_devices = list(
+ map(device_util.resolve, self._compute_devices))
+ self._canonical_compute_device_set = set(self._compute_devices)
+
+    # Define the variable device, which is a device string in the local case
+    # and a device function in the distributed case. It is used to open a
+    # device scope where variables are defined.
+ # The `_parameter_devices` is needed for the `parameter_devices` property
+ # and is a list of all variable devices.
+ if cluster_spec is None:
+ # Local mode. If there is only one GPU, put everything on that GPU.
+ # Otherwise, place variables on CPU.
+ if num_gpus_per_worker == 1:
+ assert len(list(self._compute_devices)) == 1
+ self._variable_device = _LOCAL_GPU_0
+ self._parameter_devices = [_LOCAL_GPU_0]
+ else:
+ self._variable_device = _LOCAL_CPU
+ self._parameter_devices = [_LOCAL_CPU]
+ else:
+ # Distributed mode. Place variables on ps jobs in a round-robin fashion.
+ # Note that devices returned from `replica_device_setter` are not
+ # canonical and therefore we don't canonicalize all variable devices to
+ # make them consistent.
+ # TODO(yuefengz): support passing a strategy object to control variable
+ # assignment.
+ # TODO(yuefengz): merge the logic of replica_device_setter into this
+ # class.
+ num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
+ if num_ps_replicas == 0:
+ raise ValueError("The cluster spec needs to have `ps` jobs.")
+ self._variable_device = device_setter.replica_device_setter(
+ ps_tasks=num_ps_replicas,
+ worker_device=self._worker_device,
+ merge_devices=True,
+ cluster=cluster_spec)
+
+ # Parameter devices are all tasks of the "ps" job.
+ self._parameter_devices = map("/job:ps/task:{}".format,
+ range(num_ps_replicas))
+
+ # Define the default device in cross-tower mode. In the distributed case, we
+ # set the default device to the corresponding worker to prevent these ops
+ # from being placed on other workers.
+ if cluster_spec is None:
+ self._default_device = None
+ else:
+ self._default_device = self._worker_device
+
+ def distribute_dataset(self, dataset_fn):
+ """Distributes the dataset to each local GPU."""
+ return values.PerDeviceDataset(
+ self._call_dataset_fn(dataset_fn), self._compute_devices, True)
+
+ def _broadcast(self, tensor, destinations):
+ if not cross_tower_ops_lib.check_destinations(destinations):
+ destinations = self._compute_devices
+ return self._cross_tower_ops.broadcast(tensor, destinations)
+
+ # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
+ # this creator, such as "MutableHashTable".
+ def _create_variable(self, next_creator, *args, **kwargs):
+ if "colocate_with" in kwargs:
+ with ops.device(None):
+ with ops.colocate_with(kwargs["colocate_with"]):
+ return next_creator(*args, **kwargs)
+
+ with ops.colocate_with(None, ignore_existing=True):
+ with ops.device(self._variable_device):
+ return next_creator(*args, **kwargs)
+
+ def _call_for_each_tower(self, fn, *args, **kwargs):
+ # pylint: disable=protected-access
+ return mirrored_strategy._call_for_each_tower(self, fn, *args, **kwargs)
+
+ def _verify_destinations_not_different_worker(self, destinations):
+ if destinations is None:
+ return
+ for d in cross_tower_ops_lib.get_devices_from(destinations):
+ d_spec = tf_device.DeviceSpec.from_string(d)
+ if d_spec.job == self._task_type and d_spec.task != self._task_id:
+ raise ValueError(
+ "Cannot reduce to another worker: %r, current worker is %r" %
+ (d, self._worker_device))
+
+ def _reduce(self, aggregation, value, destinations):
+ self._verify_destinations_not_different_worker(destinations)
+ if not isinstance(value, values.DistributedValues):
+ # pylint: disable=protected-access
+ return mirrored_strategy._reduce_non_distributed_value(
+ self, aggregation, value, destinations)
+
+ return self._cross_tower_ops.reduce(
+ aggregation, value, destinations=destinations)
+
+ def _batch_reduce(self, aggregation, value_destination_pairs):
+ for _, destinations in value_destination_pairs:
+ self._verify_destinations_not_different_worker(destinations)
+ return self._cross_tower_ops.batch_reduce(aggregation,
+ value_destination_pairs)
+
+ def _select_single_value(self, structured):
+ """Select any single values in `structured`."""
+
+ def _select_fn(x): # pylint: disable=g-missing-docstring
+ if isinstance(x, values.Mirrored):
+ if len(x.devices) == 1:
+ return list(x._index.values())[0] # pylint: disable=protected-access
+ else:
+ raise ValueError(
+ "You cannot update variable with a Mirrored object with multiple "
+ "components %r when using ParameterServerStrategy. You must "
+ "specify a single value or a Mirrored with a single value." % x)
+ elif isinstance(x, values.PerDevice):
+ raise ValueError(
+ "You cannot update variable with a PerDevice object %r when using "
+ "ParameterServerStrategy. You must specify a single value or a "
+ "Mirrored with a single value" % x)
+ else:
+ return x
+
+ return nest.map_structure(_select_fn, structured)
+
+ def _update(self, var, fn, *args, **kwargs):
+ if not isinstance(var, resource_variable_ops.ResourceVariable):
+ raise ValueError(
+ "You can not update `var` %r. It must be a Variable." % var)
+ with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
+ return fn(var, *self._select_single_value(args),
+ **self._select_single_value(kwargs))
+
+ # TODO(yuefengz): does it need to call _select_single_value?
+ def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+ with ops.device(
+ colocate_with.device), distribute_lib.UpdateContext(colocate_with):
+ return fn(*args, **kwargs)
+
+ def _unwrap(self, val):
+ if isinstance(val, values.DistributedValues):
+ # Return in a deterministic order.
+ if set(val.devices) == self._canonical_compute_device_set:
+ return [val.get(device=d) for d in self._compute_devices]
+ return [val.get(device=d) for d in sorted(val.devices)]
+ return [val]
+
+ def read_var(self, var):
+ # No need to distinguish between normal variables and tower-local variables.
+ return array_ops.identity(var)
+
+ def configure(self, session_config=None):
+ del session_config
+
+ # Use TF_CONFIG to get the cluster spec and the current job.
+ tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
+ cluster_spec = _normalize_cluster_spec(tf_config.get("cluster", {}))
+
+ task_env = tf_config.get("task", {})
+ if task_env:
+ task_type = task_env.get("type", "worker")
+ task_id = int(task_env.get("index", "0"))
+ else:
+ task_type = "worker"
+ task_id = None
+
+ # Set the devices if cluster_spec is defined in TF_CONFIG but not passed in
+ # the constructor.
+ if not self._cluster_spec and cluster_spec:
+ self._cluster_spec = cluster_spec
+ self._initialize_devices(self._num_gpus_per_worker, cluster_spec,
+ task_type, task_id)
+
+ @property
+ def num_towers(self):
+ return len(self._compute_devices)
+
+ @property
+ def worker_devices(self):
+ # Make a copy to prevent users from accidentally mutating our copy.
+ return list(self._compute_devices)
+
+ @property
+ def parameter_devices(self):
+ return list(self._parameter_devices)
+
+ def non_slot_devices(self, var_list):
+ return min(var_list, key=lambda x: x.name)
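
The TF_CONFIG layout that configure() parses is the standard estimator one; a
sketch of driving it end to end (host:port strings are illustrative, and in a
real deployment each task would run this with its own "task" entry):

    import json
    import os

    from tensorflow.contrib.distribute.python import parameter_server_strategy

    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {
            "worker": ["host0:2222", "host1:2222"],
            "ps": ["host2:2222"],
        },
        "task": {"type": "worker", "index": 0},
    })
    strategy = parameter_server_strategy.ParameterServerStrategy(
        num_gpus_per_worker=0)
    strategy.configure()  # reads TF_CONFIG; variables go to /job:ps round-robin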
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
new file mode 100644
index 0000000000..ad538b9e8e
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -0,0 +1,455 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ParameterServerStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import json
+import threading
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import multi_worker_test_base
+from tensorflow.contrib.distribute.python import parameter_server_strategy
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.estimator import run_config
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+
+
+class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ cls._workers, cls._ps = multi_worker_test_base.create_in_process_cluster(
+ num_workers=3, num_ps=2)
+
+ def setUp(self):
+ self._result = 0
+ self._lock = threading.Lock()
+ self._init_condition = threading.Condition()
+ self._init_reached = 0
+ self._finish_condition = threading.Condition()
+ self._finish_reached = 0
+
+ def _get_ps_distribution_strategy(self, task_type, task_index, num_gpus=0):
+ tf_config = {
+ 'cluster': {
+ run_config.TaskType.WORKER: [
+ 'fake_worker_0', 'fake_worker_1', 'fake_worker_2'
+ ],
+ run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
+ },
+ 'task': {
+ 'type': task_type,
+ 'index': task_index
+ }
+ }
+ distribution = parameter_server_strategy.ParameterServerStrategy(
+ num_gpus_per_worker=num_gpus)
+ with self._lock:
+ # Accessing environment variables should be protected by locks because
+ # environment variables are shared by all threads.
+ with test.mock.patch.dict('os.environ',
+ {'TF_CONFIG': json.dumps(tf_config)}):
+ distribution.configure()
+ return distribution
+
+ @contextlib.contextmanager
+ def _test_session(self, target):
+ config = config_pb2.ConfigProto(allow_soft_placement=True)
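+ # Opt level -1 (L0) disables graph optimizations so that the device
+ # placements asserted on below are left untouched.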
+ config.graph_options.optimizer_options.opt_level = -1
+ with session.Session(graph=None, config=config, target=target) as sess:
+ yield sess
+
+ def _test_device_assignment_distributed(self, d, num_gpus=0):
+ with ops.Graph().as_default(), \
+ self._test_session(target=self._workers[0].target) as sess, \
+ d.scope():
+
+ # Define a variable outside the call_for_each_tower scope. This is not
+ # recommended.
+ n = variable_scope.get_variable('n', initializer=10.0)
+ self.assertEqual(n.device, '/job:ps/task:0')
+
+ def model_fn():
+ if num_gpus == 0:
+ last_part_device = 'device:CPU:0'
+ else:
+ last_part_device = (
+ 'device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+
+ a = constant_op.constant(1.0)
+ b = constant_op.constant(2.0)
+ c = a + b
+ self.assertEqual(a.device,
+ '/job:worker/replica:0/task:1/%s' % last_part_device)
+ self.assertEqual(b.device,
+ '/job:worker/replica:0/task:1/%s' % last_part_device)
+ self.assertEqual(c.device,
+ '/job:worker/replica:0/task:1/%s' % last_part_device)
+
+ # The device scope is ignored for variables but not for normal ops.
+ with ops.device('/job:worker/task:0'):
+ x = variable_scope.get_variable('x', initializer=10.0)
+ x_add = x.assign_add(c)
+ e = a + c
+ # The variable x is placed on task 1 because the device_function has
+ # already been called once before model_fn runs.
+ self.assertEqual(x.device, '/job:ps/task:1')
+ self.assertEqual(x_add.device, x.device)
+ self.assertEqual(e.device,
+ '/job:worker/replica:0/task:0/%s' % last_part_device)
+
+ # colocate_vars_with can override the distribution's device placement.
+ with d.colocate_vars_with(x):
+ y = variable_scope.get_variable('y', initializer=20.0)
+ y_add = y.assign_add(x_add)
+ self.assertEqual(y.device, '/job:ps/task:1')
+ self.assertEqual(y_add.device, y.device)
+ self.assertEqual(y.device, x.device)
+
+ z = variable_scope.get_variable('z', initializer=10.0)
+ self.assertEqual(z.device, '/job:ps/task:0')
+ self.assertNotEqual(z.device, x.device)
+
+ with ops.control_dependencies([y_add]):
+ z_add = z.assign_add(y)
+ with ops.control_dependencies([z_add]):
+ f = z + c
+ self.assertEqual(f.device,
+ '/job:worker/replica:0/task:1/%s' % last_part_device)
+
+ # The device scope is merged with the default worker device.
+ with ops.device('/CPU:1'):
+ g = e + 1.0
+ self.assertEqual(g.device, '/job:worker/replica:0/task:1/device:CPU:1')
+
+ # ops.colocate_with is ignored when defining a variable but not
+ # for a normal tensor.
+ with ops.colocate_with(x):
+ u = variable_scope.get_variable('u', initializer=30.0)
+ v = variable_scope.get_variable('v', initializer=30.0)
+ h = f + 1.0
+ self.assertIn('/job:ps/', u.device)
+ self.assertIn('/job:ps/', v.device)
+ # u and v are on different parameter servers.
+ self.assertTrue(u.device != x.device or v.device != x.device)
+ self.assertTrue(u.device == x.device or v.device == x.device)
+ # Here h is placed on the parameter server rather than on a worker. Note
+ # that h.device is canonical while x.device is not.
+ self.assertIn('/job:ps/', h.device)
+ return y_add, z_add, f
+
+ y, z, f = d.call_for_each_tower(model_fn)
+ self.assertNotEqual(y, None)
+ self.assertNotEqual(z, None)
+ self.assertNotEqual(f, None)
+
+ if context.num_gpus() >= 1 and num_gpus <= 1:
+ variables.global_variables_initializer().run()
+ y_val, z_val, f_val = sess.run([y, z, f])
+ self.assertEqual(y_val, 33.0)
+ self.assertEqual(z_val, 43.0)
+ self.assertEqual(f_val, 46.0)
+
+ @combinations.generate(
+ combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+ def testDeviceAssignmentDistributed(self, num_gpus):
+ d = self._get_ps_distribution_strategy('worker', 1, num_gpus=num_gpus)
+ self._test_device_assignment_distributed(d, num_gpus=num_gpus)
+
+ def _test_device_assignment_local(self,
+ d,
+ compute_device='CPU',
+ variable_device='CPU',
+ num_gpus=0):
+ with ops.Graph().as_default(), \
+ self._test_session(target=self._workers[0].target) as sess, \
+ d.scope():
+
+ def model_fn():
+ if 'CPU' in compute_device:
+ tower_compute_device = '/device:CPU:0'
+ else:
+ tower_compute_device = (
+ '/device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+ tower_compute_device = device_util.canonicalize(tower_compute_device)
+
+ if 'CPU' in variable_device:
+ tower_variable_device = '/device:CPU:0'
+ else:
+ tower_variable_device = (
+ '/device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+ tower_variable_device = device_util.canonicalize(tower_variable_device)
+
+ a = constant_op.constant(1.0)
+ b = constant_op.constant(2.0)
+ c = a + b
+ self.assertEqual(a.device, tower_compute_device)
+ self.assertEqual(b.device, tower_compute_device)
+ self.assertEqual(c.device, tower_compute_device)
+
+ # The device scope is ignored for variables but not for normal ops.
+ with ops.device('/device:GPU:2'):
+ x = variable_scope.get_variable('x', initializer=10.0)
+ x_add = x.assign_add(c)
+ e = a + c
+ self.assertEqual(
+ device_util.canonicalize(x.device), tower_variable_device)
+ self.assertEqual(x_add.device, x.device)
+ self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))
+
+ # colocate_vars_with can override the distribution's device placement.
+ with d.colocate_vars_with(x):
+ y = variable_scope.get_variable('y', initializer=20.0)
+ y_add = y.assign_add(x_add)
+ self.assertEqual(
+ device_util.canonicalize(y.device), tower_variable_device)
+ self.assertEqual(y_add.device, y.device)
+ self.assertEqual(y.device, x.device)
+
+ z = variable_scope.get_variable('z', initializer=10.0)
+ self.assertEqual(
+ device_util.canonicalize(z.device), tower_variable_device)
+
+ with ops.control_dependencies([y_add]):
+ z_add = z.assign_add(y)
+ with ops.control_dependencies([z_add]):
+ f = z + c
+ self.assertEqual(f.device, tower_compute_device)
+
+ # The device scope is merged with the default worker device.
+ with ops.device('/CPU:1'):
+ g = e + 1.0
+ self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1'))
+
+ # ops.colocate_with is ignored when defining a variable but not
+ # for a normal tensor.
+ with ops.colocate_with(x):
+ u = variable_scope.get_variable('u', initializer=30.0)
+ h = f + 1.0
+ self.assertEqual(
+ device_util.canonicalize(u.device), tower_variable_device)
+ self.assertEqual(device_util.canonicalize(x.device), h.device)
+ return y_add, z_add, f
+
+ y, z, f = d.call_for_each_tower(model_fn)
+ self.assertNotEqual(y, None)
+ self.assertNotEqual(z, None)
+ self.assertNotEqual(f, None)
+
+ if context.num_gpus() >= 1 and num_gpus <= 1:
+ variables.global_variables_initializer().run()
+ y_val, z_val, f_val = sess.run([y, z, f])
+ self.assertEqual(y_val, 33.0)
+ self.assertEqual(z_val, 43.0)
+ self.assertEqual(f_val, 46.0)
+
+ def testDeviceAssignmentLocal(self):
+ distribution = parameter_server_strategy.ParameterServerStrategy(
+ num_gpus_per_worker=0)
+ self._test_device_assignment_local(
+ distribution, compute_device='CPU', variable_device='CPU', num_gpus=0)
+
+ distribution = parameter_server_strategy.ParameterServerStrategy(
+ num_gpus_per_worker=1)
+ self._test_device_assignment_local(
+ distribution, compute_device='GPU', variable_device='GPU', num_gpus=1)
+
+ distribution = parameter_server_strategy.ParameterServerStrategy(
+ num_gpus_per_worker=2)
+ self._test_device_assignment_local(
+ distribution, compute_device='GPU', variable_device='CPU', num_gpus=2)
+
+ def _test_simple_increment(self, d, task_type, task_index, master_target):
+ if hasattr(d, '_cluster_spec') and d._cluster_spec:
+ num_workers = len(d._cluster_spec.as_dict().get('worker',
+ ['dummy_worker']))
+ else:
+ num_workers = 1
+ with ops.Graph().as_default(), \
+ self._test_session(target=master_target) as sess, \
+ d.scope():
+
+ def model_fn():
+ x = variable_scope.get_variable('x', initializer=10.0)
+ y = variable_scope.get_variable('y', initializer=20.0)
+
+ x_add = x.assign_add(1.0, use_locking=True)
+ y_add = y.assign_add(1.0, use_locking=True)
+
+ train_op = control_flow_ops.group([x_add, y_add])
+ return x, y, train_op
+
+ x, y, train_op = d.call_for_each_tower(model_fn)
+ train_op = d.group(d.unwrap(train_op))
+
+ if context.num_gpus() < d._num_gpus_per_worker:
+ return True
+
+ if task_index == 0:
+ variables.global_variables_initializer().run()
+
+ # Workers wait for the chief worker to finish initializing variables.
+ self._init_condition.acquire()
+ self._init_reached += 1
+ while self._init_reached != num_workers:
+ self._init_condition.wait()
+ self._init_condition.notify_all()
+ self._init_condition.release()
+
+ sess.run(train_op)
+
+ # Wait for other workers to finish training.
+ self._finish_condition.acquire()
+ self._finish_reached += 1
+ while self._finish_reached != num_workers:
+ self._finish_condition.wait()
+ self._finish_condition.notify_all()
+ self._finish_condition.release()
+
+ x_val, y_val = sess.run([x, y])
+ self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_towers)
+ self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_towers)
+ return (x_val == 10.0 + 1.0 * num_workers * d.num_towers and
+ y_val == 20.0 + 1.0 * num_workers * d.num_towers)
+
+ def _test_minimize_loss_graph(self, d, task_type, task_index, master_target):
+ with ops.Graph().as_default(), \
+ self._test_session(target=master_target) as sess, \
+ d.scope():
+ l = core.Dense(1, use_bias=False)
+
+ def loss_fn(x):
+ y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
+ return y * y
+
+ # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
+ # multiple graphs (b/111216820).
+ def grad_fn(x):
+ loss = loss_fn(x)
+ var_list = (
+ variables.trainable_variables() + ops.get_collection(
+ ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+ grads = gradients.gradients(loss, var_list)
+ ret = list(zip(grads, var_list))
+ return ret
+
+ def update(v, g):
+ return v.assign_sub(0.05 * g, use_locking=True)
+
+ one = d.broadcast(constant_op.constant([[1.]]))
+
+ def step():
+ """Perform one optimization step."""
+ # Run forward & backward to get gradients, variables list.
+ g_v = d.call_for_each_tower(grad_fn, one)
+ # Update the variables using the gradients and the update() function.
+ before_list = []
+ after_list = []
+ for g, v in g_v:
+ fetched = d.read_var(v)
+ before_list.append(fetched)
+ with ops.control_dependencies([fetched]):
+ # TODO(yuefengz): support non-Mirrored variable as destinations.
+ g = d.reduce(
+ variable_scope.VariableAggregation.SUM, g, destinations=v)
+ with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+ after_list.append(d.read_var(v))
+ return before_list, after_list
+
+ before_out, after_out = step()
+
+ if context.num_gpus() < d._num_gpus_per_worker:
+ return True
+
+ if task_index == 0:
+ variables.global_variables_initializer().run()
+
+ # Workers wait for the chief worker to finish initializing variables.
+ self._init_condition.acquire()
+ self._init_reached += 1
+ while self._init_reached != 3:
+ self._init_condition.wait()
+ self._init_condition.notify_all()
+ self._init_condition.release()
+
+ for i in range(10):
+ b, a = sess.run((before_out, after_out))
+ if i == 0:
+ before, = b
+ after, = a
+
+ error_before = abs(before - 1)
+ error_after = abs(after - 1)
+ # Error should go down
+ self.assertLess(error_after, error_before)
+ return error_after < error_before
+
+ def _run_client(self, index, model_fn, num_gpus):
+ task_type = run_config.TaskType.WORKER
+ result = model_fn(
+ self._get_ps_distribution_strategy(task_type, index, num_gpus=num_gpus),
+ task_type, index, self._workers[index].target)
+ if result:
+ with self._lock:
+ self._result += 1
+
+ def _run_multiple_clients(self, num_clients, model_fn, num_gpus=0):
+ threads = []
+ for i in range(num_clients):
+ t = threading.Thread(
+ target=self._run_client, args=(i, model_fn, num_gpus))
+ t.start()
+ threads.append(t)
+ for t in threads:
+ t.join()
+
+ def testSimpleBetweenGraph(self):
+ self._run_multiple_clients(3, self._test_simple_increment)
+ self.assertEqual(self._result, 3)
+
+ @combinations.generate(
+ combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+ def testLocalSimpleIncrement(self, num_gpus):
+ d = parameter_server_strategy.ParameterServerStrategy(
+ num_gpus_per_worker=num_gpus)
+ self._test_simple_increment(d, 'dummy_worker', 0, '')
+
+ @combinations.generate(
+ combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+ def testMinimizeLossGraph(self, num_gpus):
+ self._run_multiple_clients(
+ 3, self._test_minimize_loss_graph, num_gpus=num_gpus)
+ self.assertEqual(self._result, 3)
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 1761a43251..4018b1e023 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.distribute.python import prefetching_ops_v2
from tensorflow.python.eager import context
from tensorflow.python.framework import device as tf_device
from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
@@ -77,6 +78,13 @@ class DistributedValues(object):
def devices(self):
return list(self._index.keys())
+ @property
+ def is_tensor_like(self):
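+ """Returns whether all components of this distributed value are tensors."""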
+ for v in self._index.values():
+ if not tensor_util.is_tensor(v):
+ return False
+ return True
+
def __str__(self):
return "%s:%s" % (self.__class__.__name__, self._index)
@@ -196,11 +204,54 @@ class DistributedVariable(DistributedDelegate):
# to the container without introducing a reference cycle.
for v in six.itervalues(index):
v._distributed_container = weakref.ref(self) # pylint: disable=protected-access
+ # tf.keras uses this attribute to track which variables have been
+ # initialized. When tf.keras gets the default session, it initializes all
+ # uninitialized vars. We need to make _keras_initialized a member of
+ # DistributedVariable because otherwise `__getattr__` would delegate the
+ # lookup to a component variable.
+ self._keras_initialized = False
+ # Typically, a `DistributedVariable`'s initializer is composed of the
+ # initializers of the component variables. However, in some cases, such as
+ # when restoring from a checkpoint, we may set the _initializer_op
+ # property on the entire `DistributedVariable`.
+ self._initializer_op = None
super(DistributedVariable, self).__init__(index)
+ def is_initialized(self, name=None):
+ """Identifies if all the component variables are initialized.
+
+ Args:
+ name: Name of the final `logical_and` op.
+
+ Returns:
+ The op that evaluates to True or False depending on if all the
+ component variables are initialized.
+ """
+ # We have to cast the self._index.values() to a `list` because when we
+ # use `model_to_estimator` to run tf.keras models, self._index.values() is
+ # of type `dict_values` and not `list`.
+ values_list = list(self._index.values())
+ result = values_list[0].is_initialized()
+ # We iterate through the list of values except the last one to allow us to
+ # name the final `logical_and` op the same name that is passed by the user
+ # to the `is_initialized` op. For distributed variables, the
+ # `is_initialized` op is a `logical_and` op.
+ for v in values_list[1:-1]:
+ result = math_ops.logical_and(result, v.is_initialized())
+ result = math_ops.logical_and(result, values_list[-1].is_initialized(),
+ name=name)
+ return result
+
@property
def initializer(self):
- return control_flow_ops.group([v.initializer for v in self._index.values()])
+ if self._initializer_op:
+ init_op = self._initializer_op
+ else:
+ # Return the grouped initializer ops of all the component variables of
+ # this distributed variable.
+ init_op = control_flow_ops.group(
+ [v.initializer for v in self._index.values()])
+ return init_op
@property
def graph(self):
@@ -296,12 +347,6 @@ class MirroredVariable(DistributedVariable, Mirrored,
for v in six.itervalues(index):
v._mirrored_container = weakref.ref(self) # pylint: disable=protected-access
self._primary_var = primary_var
- # tf.keras keeps track of variables initialized using this attribute. When
- # tf.keras gets the default session, it initializes all uninitialized vars.
- # We need to make _keras_initialized a member of MirroredVariable because
- # without this it will use `__getattr__` which will delegate to a component
- # variable.
- self._keras_initialized = False
self._aggregation = aggregation
super(MirroredVariable, self).__init__(index)
@@ -325,6 +370,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
return distribute_lib.get_distribution_strategy().update(
self, f, *args, **kwargs)
else:
+ _assert_tower_context()
# We are calling an assign function on the mirrored variable in tower
# context.
# We reduce the value we want to assign/add/sub. More details about how we
@@ -357,28 +403,6 @@ class MirroredVariable(DistributedVariable, Mirrored,
assign_fn = lambda var, *a, **kw: var.assign(*a, **kw)
return self._assign_func(f=assign_fn, *args, **kwargs)
- def is_initialized(self, name=None):
- # We have to cast the self._index.values() to a `list` because when we
- # use `model_to_estimator` to run tf.keras models, self._index.values() is
- # of type `dict_values` and not `list`.
- values_list = list(self._index.values())
- result = values_list[0].is_initialized()
- # We iterate through the list of values except the last one to allow us to
- # name the final `logical_and` op the same name that is passed by the user
- # to the `is_initialized` op. For mirrored variables, the `is_initialized`
- # op is a `logical_and` op.
- for v in values_list[1:-1]:
- result = math_ops.logical_and(result, v.is_initialized())
- result = math_ops.logical_and(result, values_list[-1].is_initialized(),
- name=name)
- return result
-
- @property
- def initializer(self):
- # return grouped ops of all the var initializations of component values of
- # the mirrored variable
- return control_flow_ops.group([v.initializer for v in self._index.values()])
-
@property
def aggregation(self):
return self._aggregation
@@ -443,14 +467,7 @@ class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
def restore(self, restored_tensors, restored_shapes):
"""Restore the same value into all variables."""
tensor, = restored_tensors
- # To preserve the sum across save and restore, we have to divide the
- # total across all devices when restoring a variable that was summed
- # when saving.
- if self._tower_local_variable.aggregation == vs.VariableAggregation.SUM:
- tensor *= 1. / len(self._tower_local_variable.devices)
- return control_flow_ops.group([
- _assign_on_device(d, v, tensor)
- for d, v in six.iteritems(self._tower_local_variable._index)]) # pylint: disable=protected-access
+ return self._tower_local_variable.assign(tensor)
def _assert_tower_context():
@@ -466,12 +483,6 @@ class TowerLocalVariable(DistributedVariable, PerDevice,
def __init__(self, index, primary_var, aggregation):
self._primary_var = primary_var
self._aggregation = aggregation
- # tf.keras keeps track of variables initialized using this attribute. When
- # tf.keras gets the default session, it initializes all uninitialized vars.
- # We need to make _keras_initialized a member of TowerLocalVariable because
- # without this it will use `__getattr__` which will delegate to a component
- # variable.
- self._keras_initialized = False
super(TowerLocalVariable, self).__init__(index)
def assign_sub(self, *args, **kwargs):
@@ -483,30 +494,19 @@ class TowerLocalVariable(DistributedVariable, PerDevice,
return self.get().assign_add(*args, **kwargs)
def assign(self, *args, **kwargs):
- _assert_tower_context()
- return self.get().assign(*args, **kwargs)
-
- def is_initialized(self, name=None):
- # We have to cast the self._index.values() to a `list` because when we
- # use `model_to_estimator` to run tf.keras models, self._index.values() is
- # of type `dict_values` and not `list`.
- values_list = list(self._index.values())
- result = values_list[0].is_initialized()
- # We iterate through the list of values except the last one to allow us to
- # name the final `logical_and` op the same name that is passed by the user
- # to the `is_initialized` op. For tower local variables, the
- # `is_initialized` op is a `logical_and` op.
- for v in values_list[1:-1]:
- result = math_ops.logical_and(result, v.is_initialized())
- result = math_ops.logical_and(result, values_list[-1].is_initialized(),
- name=name)
- return result
-
- @property
- def initializer(self):
- # return grouped ops of all the var initializations of component values of
- # the tower local variable
- return control_flow_ops.group([v.initializer for v in self._index.values()])
+ if distribute_lib.get_cross_tower_context():
+ # To preserve the sum across save and restore, we have to divide the
+ # total across all devices when restoring a variable that was summed
+ # when saving.
+ tensor = args[0]
+ if self._aggregation == vs.VariableAggregation.SUM:
+ tensor *= 1. / len(self.devices)
+ return control_flow_ops.group(
+ [_assign_on_device(d, v, tensor)
+ for d, v in six.iteritems(self._index)])
+ else:
+ _assert_tower_context()
+ return self.get().assign(*args, **kwargs)
@property
def aggregation(self):
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 8e44f2fea1..91a43d4999 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator import model_fn as model_fn_lib
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import random_ops
@@ -79,6 +80,30 @@ class DistributedValuesTest(test.TestCase):
with self.assertRaises(AssertionError):
v = values.DistributedValues({"/device:cpu:0": 42})
+ def testIsTensorLike(self):
+ with context.graph_mode(), \
+ ops.Graph().as_default(), \
+ ops.device("/device:CPU:0"):
+ one = constant_op.constant(1)
+ two = constant_op.constant(2)
+ v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+ self.assertEqual(two, v.get("/device:GPU:0"))
+ self.assertEqual(one, v.get())
+ self.assertTrue(v.is_tensor_like)
+ self.assertTrue(tensor_util.is_tensor(v))
+
+ def testIsTensorLikeWithAConstant(self):
+ with context.graph_mode(), \
+ ops.Graph().as_default(), \
+ ops.device("/device:CPU:0"):
+ one = constant_op.constant(1)
+ two = 2.0
+ v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+ self.assertEqual(two, v.get("/device:GPU:0"))
+ self.assertEqual(one, v.get())
+ self.assertFalse(v.is_tensor_like)
+ self.assertFalse(tensor_util.is_tensor(v))
+
class DistributedDelegateTest(test.TestCase):
diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
index ef3bdfa75f..18a0f754e6 100644
--- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
@@ -326,6 +326,21 @@ class QuantizedDistribution(distributions.Distribution):
graph_parents=graph_parents,
name=name)
+ @property
+ def distribution(self):
+ """Base distribution, p(x)."""
+ return self._dist
+
+ @property
+ def low(self):
+ """Lowest value that quantization returns."""
+ return self._low
+
+ @property
+ def high(self):
+ """Highest value that quantization returns."""
+ return self._high
+
def _batch_shape_tensor(self):
return self.distribution.batch_shape_tensor()
@@ -569,8 +584,3 @@ class QuantizedDistribution(distributions.Distribution):
dependencies = [distribution_util.assert_integer_form(
value, message="value has non-integer components.")]
return control_flow_ops.with_dependencies(dependencies, value)
-
- @property
- def distribution(self):
- """Base distribution, p(x)."""
- return self._dist
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
index 232f9a8ef0..44ff43a111 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
@@ -27,9 +27,9 @@
"id": "ITZuApL56Mny"
},
"source": [
- "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). To do this, we use Deep Convolutional Generative Adverserial Networks ([DCGAN](https://arxiv.org/pdf/1511.06434.pdf)).\n",
+ "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). To do so, we use Deep Convolutional Generative Adverserial Networks ([DCGAN](https://arxiv.org/pdf/1511.06434.pdf)).\n",
"\n",
- "On a colab GPU(Tesla K80), the model takes around 40 seconds per epoch to train.\n",
+ "This model takes about ~30 seconds per epoch (using tf.contrib.eager.defun to create graph functions) to train on a single Tesla K80 on Colab, as of July 2018.\n",
"\n",
"Below is the output generated after training the generator and discriminator models for 150 epochs.\n",
"\n",
@@ -80,6 +80,8 @@
},
"outputs": [],
"source": [
+ "from __future__ import absolute_import, division, print_function\n",
+ "\n",
"# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
"import tensorflow as tf\n",
"tf.enable_eager_execution()\n",
@@ -201,13 +203,13 @@
"## Write the generator and discriminator models\n",
"\n",
"* **Generator** \n",
- " * It is responsible for **creating the convincing images good enough to fool the discriminator**.\n",
- " * It consists of Conv2DTranspose(Upsampling) layers. We start with a fully connected layer and upsample the image 2 times so as to reach the desired image size(mnist image size) which is (28, 28, 1). \n",
+ " * It is responsible for **creating convincing images that are good enough to fool the discriminator**.\n",
+ " * It consists of Conv2DTranspose (Upsampling) layers. We start with a fully connected layer and upsample the image 2 times so as to reach the desired image size (mnist image size) which is (28, 28, 1). \n",
" * We use **leaky relu** activation except for the **last layer** which uses **tanh** activation.\n",
" \n",
"* **Discriminator**\n",
" * **The discriminator is responsible for classifying the fake images from the real images.**\n",
- " * In other words, the discriminator is given generated images(from the generator) and the real MNIST images. The job of the discriminator is to classify these images into fake(generated) and real(MNIST images).\n",
+ " * In other words, the discriminator is given generated images (from the generator) and the real MNIST images. The job of the discriminator is to classify these images into fake (generated) and real (MNIST images).\n",
" * **Basically the generator should be good enough to fool the discriminator that the generated images are real**."
]
},
@@ -313,6 +315,26 @@
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab": {
+ "autoexec": {
+ "startup": false,
+ "wait_interval": 0
+ }
+ },
+ "colab_type": "code",
+ "id": "k1HpMSLImuRi"
+ },
+ "outputs": [],
+ "source": [
+ "# Defun gives 10 secs/epoch performance boost\n",
+ "generator.call = tf.contrib.eager.defun(generator.call)\n",
+ "discriminator.call = tf.contrib.eager.defun(discriminator.call)"
+ ]
+ },
+ {
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
@@ -323,8 +345,8 @@
"\n",
"* **Discriminator loss**\n",
" * The discriminator loss function takes 2 inputs; **real images, generated images**\n",
- " * real_loss is a sigmoid cross entropy loss of the **real images** and an **array of ones(since these are the real images)**\n",
- " * generated_loss is a sigmoid cross entropy loss of the **generated images** and an **array of zeros(since these are the fake images)**\n",
+ " * real_loss is a sigmoid cross entropy loss of the **real images** and an **array of ones (since these are the real images)**\n",
+ " * generated_loss is a sigmoid cross entropy loss of the **generated images** and an **array of zeros (since these are the fake images)**\n",
" * Then the total_loss is the sum of real_loss and the generated_loss\n",
" \n",
"* **Generator loss**\n",
@@ -411,9 +433,9 @@
"\n",
"* We start by iterating over the dataset\n",
"* The generator is given **noise as an input** which when passed through the generator model will output a image looking like a handwritten digit\n",
- "* The discriminator is given the **real MNIST images as well as the generated images(from the generator)**.\n",
+ "* The discriminator is given the **real MNIST images as well as the generated images (from the generator)**.\n",
"* Next, we calculate the generator and the discriminator loss.\n",
- "* Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables(inputs) and apply those to the optimizer.\n",
+ "* Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables (inputs) and apply those to the optimizer.\n",
"\n",
"## Generate Images\n",
"\n",
@@ -442,7 +464,7 @@
"noise_dim = 100\n",
"num_examples_to_generate = 100\n",
"\n",
- "# keeping the random vector constant for generation(prediction) so\n",
+ "# keeping the random vector constant for generation (prediction) so\n",
"# it will be easier to see the improvement of the gan.\n",
"random_vector_for_generation = tf.random_normal([num_examples_to_generate,\n",
" noise_dim])"
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
index 6be09f98df..b173f856c6 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
@@ -1,31 +1,11 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "name": "text_generation.ipynb",
- "version": "0.3.2",
- "views": {},
- "default_view": {},
- "provenance": [],
- "private_outputs": true,
- "collapsed_sections": [],
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "accelerator": "GPU"
- },
"cells": [
{
+ "cell_type": "markdown",
"metadata": {
- "id": "hcD2nPQvPOFM",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "hcD2nPQvPOFM"
},
- "cell_type": "markdown",
"source": [
"##### Copyright 2018 The TensorFlow Authors.\n",
"\n",
@@ -33,19 +13,19 @@
"\n",
"# Text Generation using a RNN\n",
"\n",
- "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
- "<a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\">\n",
- " <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a> \n",
- "</td><td>\n",
- "<a target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on Github</a></td></table>"
+ "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+ "\u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\n",
+ " \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e \n",
+ "\u003c/td\u003e\u003ctd\u003e\n",
+ "\u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on Github\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "BwpJ5IffzRG6",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "BwpJ5IffzRG6"
},
- "cell_type": "markdown",
"source": [
"This notebook demonstrates how to generate text using an RNN using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). If you like, you can write a similar [model](https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/8.1-text-generation-with-lstm.ipynb) using less code. Here, we show a lower-level impementation that's useful to understand as prework before diving in to deeper examples in a similar, like [Neural Machine Translation with Attention](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
"\n",
@@ -102,58 +82,60 @@
]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "R3p22DBDsaCA",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "R3p22DBDsaCA"
},
- "cell_type": "markdown",
"source": [
"## Install unidecode library\n",
"A helpful library to convert unicode to ASCII."
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "wZ6LOM12wKGH",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "wZ6LOM12wKGH"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"!pip install unidecode"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "WGyKZj3bzf9p",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "WGyKZj3bzf9p"
},
- "cell_type": "markdown",
"source": [
"## Import tensorflow and enable eager execution."
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "yG_n40gFzf9s",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "yG_n40gFzf9s"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
- "# Import TensorFlow >= 1.9 and enable eager execution\n",
+ "# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
"import tensorflow as tf\n",
"\n",
"# Note: Once you enable eager execution, it cannot be disabled. \n",
@@ -164,16 +146,14 @@
"import random\n",
"import unidecode\n",
"import time"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "EHDoRoc5PKWz",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "EHDoRoc5PKWz"
},
- "cell_type": "markdown",
"source": [
"## Download the dataset\n",
"\n",
@@ -182,76 +162,78 @@
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "pD_55cOxLkAb",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "pD_55cOxLkAb"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
- "path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/yashkatariya/shakespeare.txt')"
- ],
- "execution_count": 0,
- "outputs": []
+ "path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')"
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "UHjdCjDuSvX_",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "UHjdCjDuSvX_"
},
- "cell_type": "markdown",
"source": [
"## Read the dataset\n",
"\n"
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "-E5JvY3wzf94",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "-E5JvY3wzf94"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"text = unidecode.unidecode(open(path_to_file).read())\n",
"# length of text is the number of characters in it\n",
"print (len(text))"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "Il9ww98izf-D",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "Il9ww98izf-D"
},
- "cell_type": "markdown",
"source": [
"Creating dictionaries to map from characters to their indices and vice-versa, which will be used to vectorize the inputs"
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "IalZLbvOzf-F",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "IalZLbvOzf-F"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"# unique contains all the unique characters in the file\n",
"unique = sorted(set(text))\n",
@@ -259,22 +241,22 @@
"# creating a mapping from unique characters to indices\n",
"char2idx = {u:i for i, u in enumerate(unique)}\n",
"idx2char = {i:u for i, u in enumerate(unique)}"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "1v_qUYfAzf-I",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "1v_qUYfAzf-I"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"# setting the maximum length sentence we want for a single input in characters\n",
"max_length = 100\n",
@@ -293,16 +275,14 @@
"\n",
"# buffer size to shuffle our dataset\n",
"BUFFER_SIZE = 10000"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "LFjSVAlWzf-N",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "LFjSVAlWzf-N"
},
- "cell_type": "markdown",
"source": [
"## Creating the input and output tensors\n",
"\n",
@@ -319,17 +299,19 @@
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "0UHJDA39zf-O",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "0UHJDA39zf-O"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"input_text = []\n",
"target_text = []\n",
@@ -343,45 +325,43 @@
" \n",
"print (np.array(input_text).shape)\n",
"print (np.array(target_text).shape)"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "MJdfPmdqzf-R",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "MJdfPmdqzf-R"
},
- "cell_type": "markdown",
"source": [
"## Creating batches and shuffling them using tf.data"
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "p2pGotuNzf-S",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "p2pGotuNzf-S"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"dataset = tf.data.Dataset.from_tensor_slices((input_text, target_text)).shuffle(BUFFER_SIZE)\n",
"dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "m8gPwEjRzf-Z",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "m8gPwEjRzf-Z"
},
- "cell_type": "markdown",
"source": [
"## Creating the model\n",
"\n",
@@ -393,17 +373,19 @@
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "P3KTiiInzf-a",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "P3KTiiInzf-a"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"class Model(tf.keras.Model):\n",
" def __init__(self, vocab_size, embedding_dim, units, batch_size):\n",
@@ -447,66 +429,64 @@
" x = self.fc(output)\n",
"\n",
" return x, states"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "trpqTWyvk0nr",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "trpqTWyvk0nr"
},
- "cell_type": "markdown",
"source": [
"## Call the model and set the optimizer and the loss function"
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "7t2XrzEOzf-e",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "7t2XrzEOzf-e"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"model = Model(vocab_size, embedding_dim, units, BATCH_SIZE)"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "dkjWIATszf-h",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "dkjWIATszf-h"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"optimizer = tf.train.AdamOptimizer()\n",
"\n",
"# using sparse_softmax_cross_entropy so that we don't have to create one-hot vectors\n",
"def loss_function(real, preds):\n",
" return tf.losses.sparse_softmax_cross_entropy(labels=real, logits=preds)"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "lPrP0XMUzf-p",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "lPrP0XMUzf-p"
},
- "cell_type": "markdown",
"source": [
"## Train the model\n",
"\n",
@@ -531,17 +511,19 @@
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "d4tSNwymzf-q",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "d4tSNwymzf-q"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"# Training step\n",
"\n",
@@ -574,16 +556,14 @@
" \n",
" print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))\n",
" print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "DjGz1tDkzf-u",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "DjGz1tDkzf-u"
},
- "cell_type": "markdown",
"source": [
"## Predicting using our trained model\n",
"\n",
@@ -601,17 +581,19 @@
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "WvuwZBX5Ogfd",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "WvuwZBX5Ogfd"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
"# Evaluation step(generating text using the model learned)\n",
"\n",
@@ -648,16 +630,14 @@
" text_generated += idx2char[predicted_id]\n",
"\n",
"print (start_string + text_generated)"
- ],
- "execution_count": 0,
- "outputs": []
+ ]
},
{
+ "cell_type": "markdown",
"metadata": {
- "id": "AM2Uma_-yVIq",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "AM2Uma_-yVIq"
},
- "cell_type": "markdown",
"source": [
"## Next steps\n",
"\n",
@@ -668,22 +648,42 @@
]
},
{
+ "cell_type": "code",
+ "execution_count": 0,
"metadata": {
- "id": "gtEd86sX5cB2",
- "colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
- }
+ },
+ "colab_type": "code",
+ "id": "gtEd86sX5cB2"
},
- "cell_type": "code",
+ "outputs": [],
"source": [
""
- ],
- "execution_count": 0,
- "outputs": []
+ ]
}
- ]
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "collapsed_sections": [],
+ "default_view": {},
+ "name": "text_generation.ipynb",
+ "private_outputs": true,
+ "provenance": [],
+ "toc_visible": true,
+ "version": "0.3.2",
+ "views": {}
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
}
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/README.md b/tensorflow/contrib/eager/python/examples/l2hmc/README.md
new file mode 100644
index 0000000000..d6a2ff7558
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/README.md
@@ -0,0 +1,54 @@
+# L2HMC with TensorFlow eager execution
+
+This folder contains an implementation of [L2HMC](https://arxiv.org/pdf/1711.09268.pdf) adapted from the released implementation by the authors. The presented implementation runs in both eager and graph mode.
+With eager execution enabled, longer sample chains can be handled than in graph mode, since no graph is explicitly stored; there is also no need for a `tf.while_loop`.
+
+## What is L2HMC?
+L2HMC is an algorithm that learns a non-volume-preserving transformation
+for an HMC-like sampler. More specifically, the non-volume-preserving
+transformation is learned with neural nets instantiated within Normalizing Flows
+(more precisely, real-NVPs).
+
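+As in standard HMC, a proposed state (x', v') is accepted with probability
+min(1, exp(H(x, v) - H(x', v') + log |det J|)); the log-Jacobian term accounts
+for the learned non-volume-preserving transformation (see
+`_compute_accept_prob` in `l2hmc.py`).
+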
+## Content
+
+- `l2hmc.py`: Dynamics definitions and example energy functions,
+including the 2D strongly correlated Gaussian, the rough well energy function,
+and a Gaussian mixture model.
+- `l2hmc_test.py`: Unit tests and benchmarks for training a sampler on the energy functions in both eager and graph mode.
+- `neural_nets.py`: The neural net for learning the kernel on the 2D strongly correlated example.
+- `main.py`: Run to train a sampler on 2D energy landscapes.
+
+## To run
+- Make sure you have installed TensorFlow 1.9+ or the latest `tf-nightly` or `tf-nightly-gpu` pip package.
+- Execute the command
+
+```bash
+python main.py --train_dir ${PWD}/dump --use_defun
+```
+
+Specifying the optional argument `train_dir` will store event files for
+TensorBoard and a plot of the sampled chain from the trained sampler.
+
+Specifying the optional argument `use_defun` will let the program use compiled
+graphs when running specific sections, improving the overall speed.
+
+## Boosting Performance with `defun`
+Currently, some models may experience increased overhead with eager execution enabled.
+To improve performance, we could wrap certain functions with the decorator `@tfe.defun`.
+For example, we could wrap the function that does the sampling step:
+
+```python
+@tfe.defun
+def apply_transition(old_sample):
+ new_sample = ...
+ return new_sample
+```
+
+We could also explicitly wrap the desired function with `tfe.defun`:
+
+```python
+apply_transition = tfe.defun(apply_transition)
+```
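+
+As a minimal end-to-end sketch (assuming TensorFlow 1.9+ with this example
+module on the Python path; the sample count and iteration count below are
+illustrative), a compiled sampling loop might look like:
+
+```python
+import tensorflow as tf
+import tensorflow.contrib.eager as tfe
+
+from tensorflow.contrib.eager.python.examples.l2hmc import l2hmc
+
+tf.enable_eager_execution()
+
+# Build the sampler for the 2D strongly correlated Gaussian.
+energy_fn, _, _ = l2hmc.get_scg_energy_fn()
+dynamics = l2hmc.Dynamics(
+    x_dim=2, minus_loglikelihood_fn=energy_fn, n_steps=25, eps=.1)
+
+# Compile the transition kernel into a graph function.
+apply_transition = tfe.defun(dynamics.apply_transition)
+
+samples = tf.random_normal(shape=[200, dynamics.x_dim])
+for _ in range(100):
+  # apply_transition returns (proposed position, proposed momentum,
+  # accept probability, next sample); we keep only the next sample.
+  _, _, _, samples = apply_transition(samples)
+```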
+
+## Reference
+Generalizing Hamiltonian Monte Carlo with Neural Networks. Levy, Daniel, Hoffman, Matthew D, and Sohl-Dickstein, Jascha. International Conference on Learning Representations (ICLR), 2018.
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py
index 275aee5130..14b8324e48 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py
@@ -32,20 +32,28 @@ from tensorflow.contrib.eager.python.examples.l2hmc import neural_nets
class Dynamics(tf.keras.Model):
- """Dynamics engine of naive L2HMC sampler.
-
- Args:
- x_dim: dimensionality of observed data
- loglikelihood_fn: log-likelihood function of conditional probability
- n_steps: number of leapfrog steps within each transition
- eps: initial value learnable scale of step size
- """
-
- def __init__(self, x_dim, loglikelihood_fn, n_steps=25, eps=.1):
+ """Dynamics engine of naive L2HMC sampler."""
+
+ def __init__(self,
+ x_dim,
+ minus_loglikelihood_fn,
+ n_steps=25,
+ eps=.1,
+ np_seed=1):
+ """Initialization.
+
+ Args:
+ x_dim: dimensionality of observed data
+ minus_loglikelihood_fn: negative log-likelihood function of the conditional probability
+ n_steps: number of leapfrog steps within each transition
+ eps: initial value of the learnable step size scale
+ np_seed: Random seed for numpy; used to control sampled masks.
+ """
super(Dynamics, self).__init__()
+ npr.seed(np_seed)
self.x_dim = x_dim
- self.potential = loglikelihood_fn
+ self.potential = minus_loglikelihood_fn
self.n_steps = n_steps
self._construct_time()
@@ -68,8 +76,8 @@ class Dynamics(tf.keras.Model):
position, forward=False)
# Decide direction uniformly
- forward_mask = tf.cast(
- tf.random_uniform(shape=[tf.shape(position)[0]]) > .5, tf.float32)
+ batch_size = tf.shape(position)[0]
+ forward_mask = tf.cast(tf.random_uniform((batch_size,)) > .5, tf.float32)
backward_mask = 1. - forward_mask
# Obtain proposed states
@@ -108,7 +116,6 @@ class Dynamics(tf.keras.Model):
position_post, momentum_post, logdet = lf_fn(position_post, momentum_post,
i)
sumlogdet += logdet
-
accept_prob = self._compute_accept_prob(position, momentum, position_post,
momentum_post, sumlogdet)
@@ -125,17 +132,17 @@ class Dynamics(tf.keras.Model):
sumlogdet += logdet
position, logdet = self._update_position_forward(position, momentum, t,
- mask)
+ mask, mask_inv)
sumlogdet += logdet
position, logdet = self._update_position_forward(position, momentum, t,
- mask_inv)
+ mask_inv, mask)
sumlogdet += logdet
momentum, logdet = self._update_momentum_forward(position, momentum, t)
sumlogdet += logdet
- return position, momentum, tf.reduce_sum(sumlogdet, axis=1)
+ return position, momentum, sumlogdet
def _backward_lf(self, position, momentum, i):
"""One backward augmented leapfrog step. See Appendix A in paper."""
@@ -149,17 +156,17 @@ class Dynamics(tf.keras.Model):
sumlogdet += logdet
position, logdet = self._update_position_backward(position, momentum, t,
- mask)
+ mask_inv, mask)
sumlogdet += logdet
position, logdet = self._update_position_backward(position, momentum, t,
- mask_inv)
+ mask, mask_inv)
sumlogdet += logdet
momentum, logdet = self._update_momentum_backward(position, momentum, t)
sumlogdet += logdet
- return position, momentum, tf.reduce_sum(sumlogdet, axis=1)
+ return position, momentum, sumlogdet
def _update_momentum_forward(self, position, momentum, t):
"""Update v in the forward leapfrog step."""
@@ -172,12 +179,11 @@ class Dynamics(tf.keras.Model):
momentum * tf.exp(scale) -
.5 * self.eps * (tf.exp(transformed) * grad - translation))
- return momentum, scale
+ return momentum, tf.reduce_sum(scale, axis=1)
- def _update_position_forward(self, position, momentum, t, mask):
+ def _update_position_forward(self, position, momentum, t, mask, mask_inv):
"""Update x in the forward leapfrog step."""
- mask_inv = 1. - mask
scale, translation, transformed = self.position_fn(
[momentum, mask * position, t])
scale *= self.eps
@@ -186,8 +192,7 @@ class Dynamics(tf.keras.Model):
mask * position +
mask_inv * (position * tf.exp(scale) + self.eps *
(tf.exp(transformed) * momentum + translation)))
-
- return position, mask_inv * scale
+ return position, tf.reduce_sum(mask_inv * scale, axis=1)
def _update_momentum_backward(self, position, momentum, t):
"""Update v in the backward leapfrog step. Inverting the forward update."""
@@ -200,21 +205,20 @@ class Dynamics(tf.keras.Model):
tf.exp(scale) * (momentum + .5 * self.eps *
(tf.exp(transformed) * grad - translation)))
- return momentum, scale
+ return momentum, tf.reduce_sum(scale, axis=1)
- def _update_position_backward(self, position, momentum, t, mask):
+ def _update_position_backward(self, position, momentum, t, mask, mask_inv):
"""Update x in the backward leapfrog step. Inverting the forward update."""
- mask_inv = 1. - mask
scale, translation, transformed = self.position_fn(
- [momentum, mask_inv * position, t])
+ [momentum, mask * position, t])
scale *= -self.eps
transformed *= self.eps
position = (
- mask_inv * position + mask * tf.exp(scale) *
- (position - self.eps * tf.exp(transformed) * momentum + translation))
+ mask * position + mask_inv * tf.exp(scale) *
+ (position - self.eps * (tf.exp(transformed) * momentum + translation)))
- return position, mask * scale
+ return position, tf.reduce_sum(mask_inv * scale, axis=1)
def _compute_accept_prob(self, position, momentum, position_post,
momentum_post, sumlogdet):
@@ -222,8 +226,10 @@ class Dynamics(tf.keras.Model):
old_hamil = self.hamiltonian(position, momentum)
new_hamil = self.hamiltonian(position_post, momentum_post)
+ prob = tf.exp(tf.minimum(old_hamil - new_hamil + sumlogdet, 0.))
- return tf.exp(tf.minimum(old_hamil - new_hamil + sumlogdet, 0.))
+ # Ensure numerical stability as well as correct gradients
+ return tf.where(tf.is_finite(prob), prob, tf.zeros_like(prob))
def _construct_time(self):
"""Convert leapfrog step index into sinusoidal time."""
@@ -248,6 +254,8 @@ class Dynamics(tf.keras.Model):
self.masks = []
for _ in range(self.n_steps):
+ # Need to use npr here because tf would generate different random
+ # values across different `sess.run` calls.
idx = npr.permutation(np.arange(self.x_dim))[:self.x_dim // 2]
mask = np.zeros((self.x_dim,))
mask[idx] = 1.
@@ -273,19 +281,15 @@ class Dynamics(tf.keras.Model):
def grad_potential(self, position, check_numerics=True):
"""Get gradient of potential function at current location."""
- if not tf.executing_eagerly():
- # TODO(lxuechen): Change this to tfe.gradients_function when it works
- grad = tf.gradients(self.potential(position), position)[0]
- else:
+ if tf.executing_eagerly():
grad = tfe.gradients_function(self.potential)(position)[0]
-
- if check_numerics:
- return tf.check_numerics(grad, message="gradient of potential")
+ else:
+ grad = tf.gradients(self.potential(position), position)[0]
return grad
-# Examples of unnormalized log density/probabilities
+# Examples of unnormalized log densities
def get_scg_energy_fn():
"""Get energy function for 2d strongly correlated Gaussian."""
@@ -295,32 +299,53 @@ def get_scg_energy_fn():
sigma_inv = tf.matrix_inverse(sigma)
def energy(x):
- """Unnormalized log density/energy of 2d strongly correlated Gaussian."""
+ """Unnormalized minus log density of 2d strongly correlated Gaussian."""
xmmu = x - mu
return .5 * tf.diag_part(
tf.matmul(tf.matmul(xmmu, sigma_inv), tf.transpose(xmmu)))
- return energy
+ return energy, mu, sigma
-def get_multivariate_gaussian_energy_fn(x_dim=2):
- """Get energy function for 2d strongly correlated Gaussian."""
-
- mu = tf.random_normal(shape=[x_dim])
- # Lower triangularize and positive diagonal
- l = tf.sigmoid(
- tf.matrix_band_part(tf.random_normal(shape=[x_dim, x_dim]), -1, 0))
- # Exploit Cholesky decomposition
- sigma = tf.matmul(l, tf.transpose(l))
- sigma *= 100. # Small covariance causes extreme numerical instability
- sigma_inv = tf.matrix_inverse(sigma)
+def get_rw_energy_fn():
+ """Get energy function for rough well distribution."""
+ # For small eta, the density underlying the rough-well energy is very close to
+ # a unit Gaussian; however, the gradient is greatly affected by the small
+ # cosine perturbations.
+ eta = 1e-2
+ mu = tf.constant([0., 0.])
+ sigma = tf.constant([[1., 0.], [0., 1.]])
def energy(x):
- """Unnormalized log density/energy of 2d strongly correlated Gaussian."""
+ ip = tf.reduce_sum(x**2., axis=1)
+ return .5 * ip + eta * tf.reduce_sum(tf.cos(x / eta), axis=1)
- xmmu = x - mu
- return .5 * tf.diag_part(
- tf.matmul(tf.matmul(xmmu, sigma_inv), tf.transpose(xmmu)))
+ return energy, mu, sigma
+
+
+# Loss function
+def compute_loss(dynamics, x, scale=.1, eps=1e-4):
+ """Compute loss defined in equation (8)."""
+
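+ # The loss rewards large expected squared jump distance,
+ # accept_prob * ||x - x'||^2, for both the data term x and the auxiliary
+ # term z, combining reciprocal and negative linear terms as in the paper.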
+ z = tf.random_normal(tf.shape(x)) # Auxiliary variable
+ x_, _, x_accept_prob, x_out = dynamics.apply_transition(x)
+ z_, _, z_accept_prob, _ = dynamics.apply_transition(z)
+
+ # Add eps for numerical stability; following released impl
+ x_loss = tf.reduce_sum((x - x_)**2, axis=1) * x_accept_prob + eps
+ z_loss = tf.reduce_sum((z - z_)**2, axis=1) * z_accept_prob + eps
+
+ loss = tf.reduce_mean(
+ (1. / x_loss + 1. / z_loss) * scale - (x_loss + z_loss) / scale, axis=0)
+
+ return loss, x_out, x_accept_prob
+
+
+def loss_and_grads(dynamics, x, loss_fn=compute_loss):
+ """Obtain loss value and gradients."""
+ with tf.GradientTape() as tape:
+ loss_val, out, accept_prob = loss_fn(dynamics, x)
+ grads = tape.gradient(loss_val, dynamics.trainable_variables)
- return energy
+ return loss_val, grads, out, accept_prob
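
For reference, a minimal NumPy sketch of the per-chain term of the loss in equation (8) that `compute_loss` implements above. The sample arrays here are hypothetical, and the full loss adds an identical term for the auxiliary chain `z`:

    import numpy as np

    def per_chain_loss(x, x_proposed, accept_prob, scale=0.1, eps=1e-4):
      # Acceptance-weighted expected squared jump distance, plus eps as above
      jump = np.sum((x - x_proposed) ** 2, axis=1) * accept_prob + eps
      # Reciprocal term rewards large moves; the linear term keeps it bounded
      return np.mean(scale / jump - jump / scale)

    x = np.random.randn(200, 2)                    # current samples
    x_proposed = x + .5 * np.random.randn(200, 2)  # hypothetical proposals
    accept_prob = np.random.uniform(size=200)
    print(per_chain_loss(x, x_proposed, accept_prob))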
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
index e33b4cae4c..9557479885 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
@@ -37,63 +37,37 @@ def get_default_hparams():
n_warmup_iters=3)
-# Relevant functions for benchmarking
-def compute_loss(dynamics, x, scale=.1, eps=1e-4):
- """Compute loss defined in equation (8)."""
-
- z = tf.random_normal(tf.shape(x))
- x_, _, x_accept_prob, x_out = dynamics.apply_transition(x)
- z_, _, z_accept_prob, _ = dynamics.apply_transition(z)
-
- # Add eps for numerical stability; following released impl
- x_loss = tf.reduce_sum((x - x_)**2, axis=1) * x_accept_prob + eps
- z_loss = tf.reduce_sum((z - z_)**2, axis=1) * z_accept_prob + eps
-
- loss = tf.reduce_mean(
- (1. / x_loss + 1. / z_loss) * scale - (x_loss + z_loss) / scale, axis=0)
-
- return loss, x_out
-
-
-def loss_and_grads(dynamics, x, loss_fn=compute_loss):
- """Obtain loss value and gradients."""
-
- with tf.GradientTape() as tape:
- loss_val, x_out = loss_fn(dynamics, x)
- grads = tape.gradient(loss_val, dynamics.variables)
-
- return loss_val, grads, x_out
-
-
-def warmup(dynamics, optimizer, n_iters=1, n_samples=200, loss_fn=compute_loss):
+def warmup(dynamics,
+ optimizer,
+ n_iters=1,
+ n_samples=200,
+ loss_fn=l2hmc.compute_loss):
"""Warmup optimization to reduce overhead."""
samples = tf.random_normal(
shape=[n_samples, dynamics.x_dim], dtype=tf.float32)
for _ in range(n_iters):
- _, grads, samples = loss_and_grads(dynamics, samples, loss_fn=loss_fn)
+ _, grads, samples, _ = l2hmc.loss_and_grads(
+ dynamics, samples, loss_fn=loss_fn)
optimizer.apply_gradients(zip(grads, dynamics.variables))
def fit(dynamics,
samples,
optimizer,
- loss_fn=compute_loss,
+ loss_fn=l2hmc.compute_loss,
n_iters=5000,
verbose=True,
- logdir=None,
- decay_lr=True):
+ logdir=None):
"""Fit L2HMC sampler with given log-likelihood function."""
if logdir:
summary_writer = tf.contrib.summary.create_file_writer(logdir)
for i in range(n_iters):
- loss, grads, samples = loss_and_grads(dynamics, samples, loss_fn=loss_fn)
- # TODO(lxuechen): Proper learning rate decay
- if decay_lr:
- grads = [grad * .96**(i // 1000) for grad in grads]
+ loss, grads, samples, _ = l2hmc.loss_and_grads(
+ dynamics, samples, loss_fn=loss_fn)
optimizer.apply_gradients(zip(grads, dynamics.variables))
if verbose:
print("Iteration %d: loss %.4f" % (i, loss))
@@ -112,9 +86,10 @@ class L2hmcTest(tf.test.TestCase):
# Eager mode testing
hparams = get_default_hparams()
+ energy_fn, _, _ = l2hmc.get_scg_energy_fn()
dynamics = l2hmc.Dynamics(
x_dim=hparams.x_dim,
- loglikelihood_fn=l2hmc.get_scg_energy_fn(),
+ minus_loglikelihood_fn=energy_fn,
n_steps=hparams.n_steps,
eps=hparams.eps)
samples = tf.random_normal(shape=[hparams.n_samples, hparams.x_dim])
@@ -127,9 +102,10 @@ class L2hmcTest(tf.test.TestCase):
# Graph mode testing
with tf.Graph().as_default():
+ energy_fn, _, _ = l2hmc.get_scg_energy_fn()
dynamics = l2hmc.Dynamics(
x_dim=hparams.x_dim,
- loglikelihood_fn=l2hmc.get_scg_energy_fn(),
+ minus_loglikelihood_fn=energy_fn,
n_steps=hparams.n_steps,
eps=hparams.eps)
x = tf.placeholder(tf.float32, shape=[None, hparams.x_dim])
@@ -150,32 +126,20 @@ class L2hmcTest(tf.test.TestCase):
class L2hmcBenchmark(tf.test.Benchmark):
"""Eager and graph benchmarks for l2hmc."""
- def _get_energy_fn(self):
- """Get specific energy function according to FLAGS."""
-
- if FLAGS.energy_fn == "scg":
- energy_fn = l2hmc.get_scg_energy_fn()
- elif FLAGS.energy_fn == "multivariate_gaussian":
- energy_fn = l2hmc.get_multivariate_gaussian_energy_fn(x_dim=FLAGS.x_dim)
- else:
- raise ValueError("No such energy function %s" % FLAGS.energy_fn)
-
- return energy_fn
-
def benchmark_graph(self):
"""Benchmark Graph performance."""
hparams = get_default_hparams()
tf.reset_default_graph()
with tf.Graph().as_default():
- energy_fn = self._get_energy_fn()
+ energy_fn, _, _ = l2hmc.get_scg_energy_fn()
dynamics = l2hmc.Dynamics(
x_dim=hparams.x_dim,
- loglikelihood_fn=energy_fn,
+ minus_loglikelihood_fn=energy_fn,
n_steps=hparams.n_steps,
eps=hparams.eps)
x = tf.placeholder(tf.float32, shape=[None, hparams.x_dim])
- loss, x_out = compute_loss(dynamics, x)
+ loss, x_out, _ = l2hmc.compute_loss(dynamics, x)
global_step = tf.Variable(0., name="global_step", trainable=False)
learning_rate = tf.train.exponential_decay(
@@ -183,7 +147,11 @@ class L2hmcBenchmark(tf.test.Benchmark):
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss, global_step=global_step)
- with tf.Session() as sess:
+ # Single thread; fairer comparison against eager
+ session_conf = tf.ConfigProto(
+ intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+
+ with tf.Session(config=session_conf) as sess:
sess.run(tf.global_variables_initializer())
# Warmup to reduce initialization effect when timing
@@ -218,14 +186,14 @@ class L2hmcBenchmark(tf.test.Benchmark):
"""Benchmark Eager performance."""
hparams = get_default_hparams()
- energy_fn = self._get_energy_fn()
+ energy_fn, _, _ = l2hmc.get_scg_energy_fn()
dynamics = l2hmc.Dynamics(
x_dim=hparams.x_dim,
- loglikelihood_fn=energy_fn,
+ minus_loglikelihood_fn=energy_fn,
n_steps=hparams.n_steps,
eps=hparams.eps)
optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)
- loss_fn = tfe.defun(compute_loss) if defun else compute_loss
+ loss_fn = tfe.defun(l2hmc.compute_loss) if defun else l2hmc.compute_loss
# Warmup to reduce initialization effect when timing
warmup(dynamics, optimizer, n_iters=hparams.n_warmup_iters, loss_fn=loss_fn)
@@ -234,12 +202,7 @@ class L2hmcBenchmark(tf.test.Benchmark):
samples = tf.random_normal(
shape=[hparams.n_samples, hparams.x_dim], dtype=tf.float32)
start_time = time.time()
- fit(dynamics,
- samples,
- optimizer,
- loss_fn=loss_fn,
- n_iters=hparams.n_iters,
- decay_lr=True)
+ fit(dynamics, samples, optimizer, loss_fn=loss_fn, n_iters=hparams.n_iters)
wall_time = time.time() - start_time
examples_per_sec = hparams.n_samples / wall_time
@@ -251,14 +214,8 @@ class L2hmcBenchmark(tf.test.Benchmark):
wall_time=wall_time)
del dynamics
- del loss_fn
if __name__ == "__main__":
- tf.flags.DEFINE_string("energy_fn", "scg",
- ("The energy function/unnormalized log-probability. "
- "Either be `scg` or `multivariate_gaussian`"))
- tf.flags.DEFINE_integer("x_dim", 2, "Dimensionality of observation space.")
- FLAGS = tf.flags.FLAGS
tf.enable_eager_execution()
tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/main.py b/tensorflow/contrib/eager/python/examples/l2hmc/main.py
new file mode 100644
index 0000000000..45e1f98429
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/main.py
@@ -0,0 +1,235 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""L2HMC on simple Gaussian mixture model with TensorFlow eager."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+from absl import flags
+import numpy as np
+import tensorflow as tf
+from tensorflow.contrib.eager.python.examples.l2hmc import l2hmc
+try:
+ import matplotlib.pyplot as plt # pylint: disable=g-import-not-at-top
+ HAS_MATPLOTLIB = True
+except ImportError:
+ HAS_MATPLOTLIB = False
+tfe = tf.contrib.eager
+
+
+def main(_):
+ tf.enable_eager_execution()
+ global_step = tf.train.get_or_create_global_step()
+ global_step.assign(1)
+
+ energy_fn, mean, covar = {
+ "scg": l2hmc.get_scg_energy_fn(),
+ "rw": l2hmc.get_rw_energy_fn()
+ }[FLAGS.energy_fn]
+
+ x_dim = 2
+ train_iters = 5000
+ eval_iters = 2000
+ eps = 0.1
+ n_steps = 10 # Chain length
+ n_samples = 200
+ record_loss_every = 100
+
+ dynamics = l2hmc.Dynamics(
+ x_dim=x_dim, minus_loglikelihood_fn=energy_fn, n_steps=n_steps, eps=eps)
+ learning_rate = tf.train.exponential_decay(
+ 1e-3, global_step, 1000, 0.96, staircase=True)
+ optimizer = tf.train.AdamOptimizer(learning_rate)
+ checkpointer = tf.train.Checkpoint(
+ optimizer=optimizer, dynamics=dynamics, global_step=global_step)
+
+ if FLAGS.train_dir:
+ summary_writer = tf.contrib.summary.create_file_writer(FLAGS.train_dir)
+ if FLAGS.restore:
+ latest_path = tf.train.latest_checkpoint(FLAGS.train_dir)
+ checkpointer.restore(latest_path)
+ print("Restored latest checkpoint at path:\"{}\" ".format(latest_path))
+ sys.stdout.flush()
+
+ if not FLAGS.restore:
+ # Training
+ if FLAGS.use_defun:
+ # Use `tfe.defun` to boost performance when there are lots of small ops
+ loss_fn = tfe.defun(l2hmc.compute_loss)
+ else:
+ loss_fn = l2hmc.compute_loss
+
+ samples = tf.random_normal(shape=[n_samples, x_dim])
+ for i in range(1, train_iters + 1):
+ loss, samples, accept_prob = train_one_iter(
+ dynamics,
+ samples,
+ optimizer,
+ loss_fn=loss_fn,
+ global_step=global_step)
+
+ if i % record_loss_every == 0:
+ print("Iteration {}, loss {:.4f}, x_accept_prob {:.4f}".format(
+ i, loss.numpy(),
+ accept_prob.numpy().mean()))
+ if FLAGS.train_dir:
+ with summary_writer.as_default():
+ with tf.contrib.summary.always_record_summaries():
+ tf.contrib.summary.scalar("Training loss", loss, step=global_step)
+ print("Training complete.")
+ sys.stdout.flush()
+
+ if FLAGS.train_dir:
+ saved_path = checkpointer.save(
+ file_prefix=os.path.join(FLAGS.train_dir, "ckpt"))
+ print("Saved checkpoint at path: \"{}\" ".format(saved_path))
+ sys.stdout.flush()
+
+ # Evaluation
+ if FLAGS.use_defun:
+ # Use `tfe.defun` to boost performance when there are lots of small ops
+ apply_transition = tfe.defun(dynamics.apply_transition)
+ else:
+ apply_transition = dynamics.apply_transition
+
+ samples = tf.random_normal(shape=[n_samples, x_dim])
+ samples_history = []
+ for i in range(eval_iters):
+ samples_history.append(samples.numpy())
+ _, _, _, samples = apply_transition(samples)
+ samples_history = np.array(samples_history)
+ print("Sampling complete.")
+ sys.stdout.flush()
+
+ # Mean and covariance of target distribution
+ mean = mean.numpy()
+ covar = covar.numpy()
+ ac_spectrum = compute_ac_spectrum(samples_history, mean, covar)
+ print("First 25 entries of the auto-correlation spectrum: {}".format(
+ ac_spectrum[:25]))
+ ess = compute_ess(ac_spectrum)
+ print("Effective sample size per Metropolis-Hastings step: {}".format(ess))
+ sys.stdout.flush()
+
+ if FLAGS.train_dir:
+ # Plot autocorrelation spectrum in tensorboard
+ plot_step = tfe.Variable(1, trainable=False, dtype=tf.int64)
+
+ for ac in ac_spectrum:
+ with summary_writer.as_default():
+ with tf.contrib.summary.always_record_summaries():
+ tf.contrib.summary.scalar("Autocorrelation", ac, step=plot_step)
+ plot_step.assign(plot_step + n_steps)
+
+ if HAS_MATPLOTLIB:
+ # Choose a single chain and plot the trajectory
+ single_chain = samples_history[:, 0, :]
+ xs = single_chain[:100, 0]
+ ys = single_chain[:100, 1]
+ plt.figure()
+ plt.plot(xs, ys, color="orange", marker="o", alpha=0.6) # Trained chain
+ plt.savefig(os.path.join(FLAGS.train_dir, "single_chain.png"))
+
+
+def train_one_iter(dynamics,
+ x,
+ optimizer,
+ loss_fn=l2hmc.compute_loss,
+ global_step=None):
+ """Train the sampler for one iteration."""
+ loss, grads, out, accept_prob = l2hmc.loss_and_grads(
+ dynamics, x, loss_fn=loss_fn)
+ optimizer.apply_gradients(
+ zip(grads, dynamics.trainable_variables), global_step=global_step)
+
+ return loss, out, accept_prob
+
+
+def compute_ac_spectrum(samples_history, target_mean, target_covar):
+ """Compute autocorrelation spectrum.
+
+ Follows equation 15 from the L2HMC paper.
+
+ Args:
+ samples_history: Numpy array of shape [T, B, D], where T is the total
+ number of time steps, B is the batch size, and D is the dimensionality
+ of sample space.
+ target_mean: 1D Numpy array of the mean of the target (true) distribution.
+ target_covar: 2D Numpy array, the symmetric covariance matrix of the target.
+ Returns:
+ Autocorrelation spectrum, Numpy array of shape [T-1].
+ """
+
+ # Using numpy here since eager is a bit slow due to the loop
+ time_steps = samples_history.shape[0]
+ trace = np.trace(target_covar)
+
+ rhos = []
+ for t in range(time_steps - 1):
+ rho_t = 0.
+ for tau in range(time_steps - t):
+ v_tau = samples_history[tau, :, :] - target_mean
+ v_tau_plus_t = samples_history[tau + t, :, :] - target_mean
+ # Take dot product over observation dims and take mean over batch dims
+ rho_t += np.mean(np.sum(v_tau * v_tau_plus_t, axis=1))
+
+ rho_t /= trace * (time_steps - t)
+ rhos.append(rho_t)
+
+ return np.array(rhos)
+
+
+def compute_ess(ac_spectrum):
+ """Compute the effective sample size based on autocorrelation spectrum.
+
+ This follows equation 16 from the L2HMC paper.
+
+ Args:
+ ac_spectrum: Autocorrelation spectrum
+ Returns:
+ The effective sample size
+ """
+ # Cutoff from the first value less than 0.05
+ cutoff = np.argmax(ac_spectrum[1:] < .05)
+ if cutoff == 0:
+ cutoff = len(ac_spectrum)
+ ess = 1. / (1. + 2. * np.sum(ac_spectrum[1:cutoff]))
+ return ess
+
+
+if __name__ == "__main__":
+ flags.DEFINE_string(
+ "train_dir",
+ default=None,
+ help="[Optional] Directory to store the training information")
+ flags.DEFINE_boolean(
+ "restore",
+ default=False,
+ help="[Optional] Restore the latest checkpoint from `train_dir` if True")
+ flags.DEFINE_boolean(
+ "use_defun",
+ default=False,
+ help="[Optional] Use `tfe.defun` to boost performance")
+ flags.DEFINE_string(
+ "energy_fn",
+ default="scg",
+ help="[Optional] The energy function used for experimentation"
+ "Other options include `rw`")
+ FLAGS = flags.FLAGS
+ tf.app.run(main)
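
A quick usage sketch of the effective-sample-size rule in `compute_ess` above, applied to a synthetic geometrically decaying spectrum (the 0.9 decay rate is made up; `compute_ac_spectrum` would supply the real input):

    import numpy as np

    ac_spectrum = .9 ** np.arange(100)  # stand-in for compute_ac_spectrum output

    # Truncate at the first entry below 0.05, as in compute_ess
    cutoff = np.argmax(ac_spectrum[1:] < .05)
    if cutoff == 0:
      cutoff = len(ac_spectrum)
    ess = 1. / (1. + 2. * np.sum(ac_spectrum[1:cutoff]))
    print("ESS per Metropolis-Hastings step: %.4f" % ess)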
diff --git a/tensorflow/contrib/eager/python/examples/revnet/BUILD b/tensorflow/contrib/eager/python/examples/revnet/BUILD
index 0c0e4c0eb9..4f0d46b1ba 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/revnet/BUILD
@@ -43,6 +43,27 @@ py_library(
],
)
+py_library(
+ name = "resnet_preprocessing",
+ srcs = ["resnet_preprocessing.py"],
+ srcs_version = "PY2AND3",
+ tags = ["local"],
+ deps = [
+ "//tensorflow:tensorflow_py",
+ ],
+)
+
+py_library(
+ name = "imagenet_input",
+ srcs = ["imagenet_input.py"],
+ srcs_version = "PY2AND3",
+ tags = ["local"],
+ deps = [
+ ":resnet_preprocessing",
+ "//tensorflow:tensorflow_py",
+ ],
+)
+
# Tests
cuda_py_test(
name = "ops_test",
@@ -113,3 +134,39 @@ py_binary(
"//tensorflow:tensorflow_py",
],
)
+
+py_binary(
+ name = "main_estimator",
+ srcs = ["main_estimator.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":cifar_input",
+ ":main",
+ ":revnet",
+ "//tensorflow:tensorflow_py",
+ ],
+)
+
+py_library(
+ name = "main_estimator_lib",
+ srcs = ["main_estimator.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":cifar_input",
+ ":main",
+ ":revnet",
+ "//tensorflow:tensorflow_py",
+ ],
+)
+
+py_library(
+ name = "main_estimator_tpu_lib",
+ srcs = ["main_estimator_tpu.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":cifar_input",
+ ":main",
+ ":revnet",
+ "//tensorflow:tensorflow_py",
+ ],
+)
diff --git a/tensorflow/contrib/eager/python/examples/revnet/blocks.py b/tensorflow/contrib/eager/python/examples/revnet/blocks.py
index 306096e9f8..89712f2c45 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/blocks.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/blocks.py
@@ -24,6 +24,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import functools
+import operator
+
import tensorflow as tf
from tensorflow.contrib.eager.python.examples.revnet import ops
@@ -45,7 +48,7 @@ class RevBlock(tf.keras.Model):
bottleneck=False,
fused=True,
dtype=tf.float32):
- """Initialize RevBlock.
+ """Initialization.
Args:
n_res: number of residual blocks
@@ -88,49 +91,27 @@ class RevBlock(tf.keras.Model):
h = block(h, training=training)
return h
- def backward_grads_and_vars(self, x, y, dy, training=True):
+ def backward_grads(self, x, y, dy, training=True):
"""Apply reversible block backward to outputs."""
grads_all = []
- vars_all = []
-
for i in reversed(range(len(self.blocks))):
block = self.blocks[i]
if i == 0:
# First block usually contains downsampling that can't be reversed
- with tf.GradientTape() as tape:
- x = tf.identity(x)
- tape.watch(x)
- y = block(x, training=training)
-
- grads_combined = tape.gradient(
- y, [x] + block.trainable_variables, output_gradients=dy)
- dy = grads_combined[0]
- grads_all += grads_combined[1:]
- vars_all += block.trainable_variables
+ dy, grads = block.backward_grads_with_downsample(
+ x, y, dy, training=True)
else:
- y, dy, grads, vars_ = block.backward_grads_and_vars(
- y, dy, training=training)
- grads_all += grads
- vars_all += vars_
+ y, dy, grads = block.backward_grads(y, dy, training=training)
+ grads_all = grads + grads_all
- return dy, grads_all, vars_all
+ return dy, grads_all
class _Residual(tf.keras.Model):
"""Single residual block contained in a _RevBlock. Each `_Residual` object has
two _ResidualInner objects, corresponding to the `F` and `G` functions in the
paper.
-
- Args:
- filters: output filter size
- strides: length 2 list/tuple of integers for height and width strides
- input_shape: length 3 list/tuple of integers
- batch_norm_first: whether to apply activation and batch norm before conv
- data_format: tensor data format, "NCHW"/"NHWC",
- bottleneck: use bottleneck residual if True
- fused: use fused batch normalization if True
- dtype: float16, float32, or float64
"""
def __init__(self,
@@ -142,6 +123,18 @@ class _Residual(tf.keras.Model):
bottleneck=False,
fused=True,
dtype=tf.float32):
+ """Initialization.
+
+ Args:
+ filters: output filter size
+ strides: length 2 list/tuple of integers for height and width strides
+ input_shape: length 3 list/tuple of integers
+ batch_norm_first: whether to apply activation and batch norm before conv
+ data_format: tensor data format, "NCHW"/"NHWC",
+ bottleneck: use bottleneck residual if True
+ fused: use fused batch normalization if True
+ dtype: float16, float32, or float64
+ """
super(_Residual, self).__init__()
self.filters = filters
@@ -191,167 +184,324 @@ class _Residual(tf.keras.Model):
return tf.concat([y1, y2], axis=self.axis)
- def backward_grads_and_vars(self, y, dy, training=True):
+ def backward_grads(self, y, dy, training=True):
"""Manually compute backward gradients given input and output grads."""
dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self.axis)
+ y1, y2 = tf.split(y, num_or_size_splits=2, axis=self.axis)
- with tf.GradientTape(persistent=True) as tape:
- y = tf.identity(y)
- tape.watch(y)
- y1, y2 = tf.split(y, num_or_size_splits=2, axis=self.axis)
- z1 = y1
- gz1 = self.g(z1, training=training)
- x2 = y2 - gz1
+ with tf.GradientTape() as gtape:
+ gtape.watch(y1)
+ gy1 = self.g(y1, training=training)
+ grads_combined = gtape.gradient(
+ gy1, [y1] + self.g.trainable_variables, output_gradients=dy2)
+ dg = grads_combined[1:]
+ dx1 = dy1 + grads_combined[0]
+ # This doesn't affect eager execution, but improves memory efficiency with
+ # graphs
+ with tf.control_dependencies(dg + [dx1]):
+ x2 = y2 - gy1
+
+ with tf.GradientTape() as ftape:
+ ftape.watch(x2)
fx2 = self.f(x2, training=training)
- x1 = z1 - fx2
+ grads_combined = ftape.gradient(
+ fx2, [x2] + self.f.trainable_variables, output_gradients=dx1)
+ df = grads_combined[1:]
+ dx2 = dy2 + grads_combined[0]
+ # Same behavior as above
+ with tf.control_dependencies(df + [dx2]):
+ x1 = y1 - fx2
- grads_combined = tape.gradient(
- gz1, [z1] + self.g.trainable_variables, output_gradients=dy2)
- dz1 = dy1 + grads_combined[0]
+ x = tf.concat([x1, x2], axis=self.axis)
+ dx = tf.concat([dx1, dx2], axis=self.axis)
+ grads = df + dg
+
+ return x, dx, grads
+
+ def backward_grads_with_downsample(self, x, y, dy, training=True):
+ """Manually compute backward gradients given input and output grads."""
+ # Splitting this from `backward_grads` for better readability
+ x1, x2 = tf.split(x, num_or_size_splits=2, axis=self.axis)
+ y1, _ = tf.split(y, num_or_size_splits=2, axis=self.axis)
+ dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self.axis)
+
+ with tf.GradientTape() as gtape:
+ gtape.watch(y1)
+ gy1 = self.g(y1, training=training)
+ grads_combined = gtape.gradient(
+ gy1, [y1] + self.g.trainable_variables, output_gradients=dy2)
dg = grads_combined[1:]
- dx1 = dz1
+ dz1 = dy1 + grads_combined[0]
- grads_combined = tape.gradient(
+ # dx1 needs one more step to backprop through the downsample
+ with tf.GradientTape() as x1tape:
+ x1tape.watch(x1)
+ z1 = ops.downsample(x1, self.filters // 2, self.strides, axis=self.axis)
+ dx1 = x1tape.gradient(z1, x1, output_gradients=dz1)
+
+ with tf.GradientTape() as ftape:
+ ftape.watch(x2)
+ fx2 = self.f(x2, training=training)
+ grads_combined = ftape.gradient(
fx2, [x2] + self.f.trainable_variables, output_gradients=dz1)
- dx2 = dy2 + grads_combined[0]
- df = grads_combined[1:]
+ dx2, df = grads_combined[0], grads_combined[1:]
- del tape
+ # dx2 needs one more step to backprop through the downsample
+ with tf.GradientTape() as x2tape:
+ x2tape.watch(x2)
+ z2 = ops.downsample(x2, self.filters // 2, self.strides, axis=self.axis)
+ dx2 += x2tape.gradient(z2, x2, output_gradients=dy2)
- grads = df + dg
- vars_ = self.f.trainable_variables + self.g.trainable_variables
-
- x = tf.concat([x1, x2], axis=self.axis)
dx = tf.concat([dx1, dx2], axis=self.axis)
+ grads = df + dg
- return x, dx, grads, vars_
+ return dx, grads
-def _BottleneckResidualInner(filters,
- strides,
- input_shape,
- batch_norm_first=True,
- data_format="channels_first",
- fused=True,
- dtype=tf.float32):
+# Ideally, the following should be wrapped in `tf.keras.Sequential`; however,
+# there are subtle issues with its placeholder insertion policy and batch norm
+class _BottleneckResidualInner(tf.keras.Model):
"""Single bottleneck residual inner function contained in _Resdual.
Corresponds to the `F`/`G` functions in the paper.
Suitable for training on ImageNet dataset.
-
- Args:
- filters: output filter size
- strides: length 2 list/tuple of integers for height and width strides
- input_shape: length 3 list/tuple of integers
- batch_norm_first: whether to apply activation and batch norm before conv
- data_format: tensor data format, "NCHW"/"NHWC"
- fused: use fused batch normalization if True
- dtype: float16, float32, or float64
-
- Returns:
- A keras model
"""
- axis = 1 if data_format == "channels_first" else 3
- model = tf.keras.Sequential()
- if batch_norm_first:
- model.add(
- tf.keras.layers.BatchNormalization(
- axis=axis, input_shape=input_shape, fused=fused, dtype=dtype))
- model.add(tf.keras.layers.Activation("relu"))
- model.add(
- tf.keras.layers.Conv2D(
- filters=filters // 4,
- kernel_size=1,
- strides=strides,
- input_shape=input_shape,
- data_format=data_format,
- use_bias=False,
- padding="SAME",
- dtype=dtype))
-
- model.add(
- tf.keras.layers.BatchNormalization(axis=axis, fused=fused, dtype=dtype))
- model.add(tf.keras.layers.Activation("relu"))
- model.add(
- tf.keras.layers.Conv2D(
- filters=filters // 4,
- kernel_size=3,
- strides=(1, 1),
- data_format=data_format,
- use_bias=False,
- padding="SAME",
- dtype=dtype))
-
- model.add(
- tf.keras.layers.BatchNormalization(axis=axis, fused=fused, dtype=dtype))
- model.add(tf.keras.layers.Activation("relu"))
- model.add(
- tf.keras.layers.Conv2D(
- filters=filters,
- kernel_size=1,
- strides=(1, 1),
- data_format=data_format,
- use_bias=False,
- padding="SAME",
- dtype=dtype))
+ def __init__(self,
+ filters,
+ strides,
+ input_shape,
+ batch_norm_first=True,
+ data_format="channels_first",
+ fused=True,
+ dtype=tf.float32):
+ """Initialization.
- return model
+ Args:
+ filters: output filter size
+ strides: length 2 list/tuple of integers for height and width strides
+ input_shape: length 3 list/tuple of integers
+ batch_norm_first: whether to apply activation and batch norm before conv
+ data_format: tensor data format, "NCHW"/"NHWC"
+ fused: use fused batch normalization if True
+ dtype: float16, float32, or float64
+ """
+ super(_BottleneckResidualInner, self).__init__()
+ axis = 1 if data_format == "channels_first" else 3
+ if batch_norm_first:
+ self.batch_norm_0 = tf.keras.layers.BatchNormalization(
+ axis=axis, input_shape=input_shape, fused=fused, dtype=dtype)
+ self.conv2d_1 = tf.keras.layers.Conv2D(
+ filters=filters // 4,
+ kernel_size=1,
+ strides=strides,
+ input_shape=input_shape,
+ data_format=data_format,
+ use_bias=False,
+ padding="SAME",
+ dtype=dtype)
+ self.batch_norm_1 = tf.keras.layers.BatchNormalization(
+ axis=axis, fused=fused, dtype=dtype)
+ self.conv2d_2 = tf.keras.layers.Conv2D(
+ filters=filters // 4,
+ kernel_size=3,
+ strides=(1, 1),
+ data_format=data_format,
+ use_bias=False,
+ padding="SAME",
+ dtype=dtype)
+
+ self.batch_norm_2 = tf.keras.layers.BatchNormalization(
+ axis=axis, fused=fused, dtype=dtype)
+ self.conv2d_3 = tf.keras.layers.Conv2D(
+ filters=filters,
+ kernel_size=1,
+ strides=(1, 1),
+ data_format=data_format,
+ use_bias=False,
+ padding="SAME",
+ dtype=dtype)
+
+ self.batch_norm_first = batch_norm_first
+
+ def call(self, x, training=True):
+ net = x
+ if self.batch_norm_first:
+ net = self.batch_norm_0(net, training=training)
+ net = tf.nn.relu(net)
+ net = self.conv2d_1(net)
-def _ResidualInner(filters,
- strides,
- input_shape,
- batch_norm_first=True,
- data_format="channels_first",
- fused=True,
- dtype=tf.float32):
+ net = self.batch_norm_1(net, training=training)
+ net = tf.nn.relu(net)
+ net = self.conv2d_2(net)
+
+ net = self.batch_norm_2(net, training=training)
+ net = tf.nn.relu(net)
+ net = self.conv2d_3(net)
+
+ return net
+
+
+class _ResidualInner(tf.keras.Model):
"""Single residual inner function contained in _ResdualBlock.
Corresponds to the `F`/`G` functions in the paper.
-
- Args:
- filters: output filter size
- strides: length 2 list/tuple of integers for height and width strides
- input_shape: length 3 list/tuple of integers
- batch_norm_first: whether to apply activation and batch norm before conv
- data_format: tensor data format, "NCHW"/"NHWC"
- fused: use fused batch normalization if True
- dtype: float16, float32, or float64
-
- Returns:
- A keras model
"""
- axis = 1 if data_format == "channels_first" else 3
- model = tf.keras.Sequential()
- if batch_norm_first:
- model.add(
- tf.keras.layers.BatchNormalization(
- axis=axis, input_shape=input_shape, fused=fused, dtype=dtype))
- model.add(tf.keras.layers.Activation("relu"))
- model.add(
- tf.keras.layers.Conv2D(
- filters=filters,
- kernel_size=3,
- strides=strides,
- input_shape=input_shape,
- data_format=data_format,
- use_bias=False,
- padding="SAME",
- dtype=dtype))
-
- model.add(
- tf.keras.layers.BatchNormalization(axis=axis, fused=fused, dtype=dtype))
- model.add(tf.keras.layers.Activation("relu"))
- model.add(
- tf.keras.layers.Conv2D(
- filters=filters,
- kernel_size=3,
- strides=(1, 1),
- data_format=data_format,
- use_bias=False,
+ def __init__(self,
+ filters,
+ strides,
+ input_shape,
+ batch_norm_first=True,
+ data_format="channels_first",
+ fused=True,
+ dtype=tf.float32):
+ """Initialization.
+
+ Args:
+ filters: output filter size
+ strides: length 2 list/tuple of integers for height and width strides
+ input_shape: length 3 list/tuple of integers
+ batch_norm_first: whether to apply activation and batch norm before conv
+ data_format: tensor data format, "NCHW"/"NHWC"
+ fused: use fused batch normalization if True
+ dtype: float16, float32, or float64
+ """
+ super(_ResidualInner, self).__init__()
+ axis = 1 if data_format == "channels_first" else 3
+ if batch_norm_first:
+ self.batch_norm_0 = tf.keras.layers.BatchNormalization(
+ axis=axis, input_shape=input_shape, fused=fused, dtype=dtype)
+ self.conv2d_1 = tf.keras.layers.Conv2D(
+ filters=filters,
+ kernel_size=3,
+ strides=strides,
+ input_shape=input_shape,
+ data_format=data_format,
+ use_bias=False,
+ padding="SAME",
+ dtype=dtype)
+
+ self.batch_norm_1 = tf.keras.layers.BatchNormalization(
+ axis=axis, fused=fused, dtype=dtype)
+ self.conv2d_2 = tf.keras.layers.Conv2D(
+ filters=filters,
+ kernel_size=3,
+ strides=(1, 1),
+ data_format=data_format,
+ use_bias=False,
+ padding="SAME",
+ dtype=dtype)
+
+ self.batch_norm_first = batch_norm_first
+
+ def call(self, x, training=True):
+ net = x
+ if self.batch_norm_first:
+ net = self.batch_norm_0(net, training=training)
+ net = tf.nn.relu(net)
+ net = self.conv2d_1(net)
+
+ net = self.batch_norm_1(net, training=training)
+ net = tf.nn.relu(net)
+ net = self.conv2d_2(net)
+
+ return net
+
+
+class InitBlock(tf.keras.Model):
+ """Initial block of RevNet."""
+
+ def __init__(self, config):
+ """Initialization.
+
+ Args:
+ config: tf.contrib.training.HParams object; specifies hyperparameters
+ """
+ super(InitBlock, self).__init__()
+ self.config = config
+ self.axis = 1 if self.config.data_format == "channels_first" else 3
+ self.conv2d = tf.keras.layers.Conv2D(
+ filters=self.config.init_filters,
+ kernel_size=self.config.init_kernel,
+ strides=(self.config.init_stride, self.config.init_stride),
+ data_format=self.config.data_format,
+ use_bias=False,
+ padding="SAME",
+ input_shape=self.config.input_shape,
+ dtype=self.config.dtype)
+ self.batch_norm = tf.keras.layers.BatchNormalization(
+ axis=self.axis, fused=self.config.fused, dtype=self.config.dtype)
+ self.activation = tf.keras.layers.Activation("relu")
+
+ if self.config.init_max_pool:
+ self.max_pool = tf.keras.layers.MaxPooling2D(
+ pool_size=(3, 3),
+ strides=(2, 2),
padding="SAME",
- dtype=dtype))
+ data_format=self.config.data_format,
+ dtype=self.config.dtype)
+
+ def call(self, x, training=True):
+ net = x
+ net = self.conv2d(net)
+ net = self.batch_norm(net, training=training)
+ net = self.activation(net)
+
+ if self.config.init_max_pool:
+ net = self.max_pool(net)
+
+ return net
- return model
+
+class FinalBlock(tf.keras.Model):
+ """Final block of RevNet."""
+
+ def __init__(self, config):
+ """Initialization.
+
+ Args:
+ config: tf.contrib.training.HParams object; specifies hyperparameters
+
+ Raises:
+ ValueError: Unsupported data format
+ """
+ super(FinalBlock, self).__init__()
+ self.config = config
+ self.axis = 1 if self.config.data_format == "channels_first" else 3
+
+ f = self.config.filters[-1] # Number of filters
+ r = functools.reduce(operator.mul, self.config.strides, 1) # Reduce ratio
+ r *= self.config.init_stride
+ if self.config.init_max_pool:
+ r *= 2
+
+ if self.config.data_format == "channels_first":
+ w, h = self.config.input_shape[1], self.config.input_shape[2]
+ input_shape = (f, w // r, h // r)
+ elif self.config.data_format == "channels_last":
+ w, h = self.config.input_shape[0], self.config.input_shape[1]
+ input_shape = (w // r, h // r, f)
+ else:
+ raise ValueError("Data format should be either `channels_first`"
+ " or `channels_last`")
+ self.batch_norm = tf.keras.layers.BatchNormalization(
+ axis=self.axis,
+ input_shape=input_shape,
+ fused=self.config.fused,
+ dtype=self.config.dtype)
+ self.activation = tf.keras.layers.Activation("relu")
+ self.global_avg_pool = tf.keras.layers.GlobalAveragePooling2D(
+ data_format=self.config.data_format, dtype=self.config.dtype)
+ self.dense = tf.keras.layers.Dense(
+ self.config.n_classes, dtype=self.config.dtype)
+
+ def call(self, x, training=True):
+ net = x
+ net = self.batch_norm(net, training=training)
+ net = self.activation(net)
+ net = self.global_avg_pool(net)
+ net = self.dense(net)
+
+ return net
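
The rewritten backward pass above leans on the reversibility identity: with y1 = x1 + F(x2) and y2 = x2 + G(y1), the block inputs are recomputed from its outputs rather than stored. A minimal NumPy sketch, with arbitrary stand-ins for F and G:

    import numpy as np

    f = lambda t: np.tanh(t)      # stand-in for the F residual inner
    g = lambda t: .5 * np.sin(t)  # stand-in for the G residual inner

    x1, x2 = np.random.randn(4, 8), np.random.randn(4, 8)

    # Forward pass of one reversible block
    y1 = x1 + f(x2)
    y2 = x2 + g(y1)

    # Reconstruction, mirroring backward_grads: no saved activations needed
    x2_rec = y2 - g(y1)
    x1_rec = y1 - f(x2_rec)

    assert np.allclose(x1, x1_rec) and np.allclose(x2, x2_rec)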
diff --git a/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py b/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py
index d74785c8fe..3c6ea63e48 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py
@@ -179,7 +179,7 @@ class RevBlockTest(tf.test.TestCase):
degree = compute_degree(g1, g2)
self.assertLessEqual(degree, atol)
- def test_backward_grads_and_vars_channels_first(self):
+ def test_backward_grads_channels_first(self):
"""Test `backward` function with `channels_first` data format."""
if not tf.test.is_gpu_available():
self.skipTest("GPU not available")
@@ -201,7 +201,8 @@ class RevBlockTest(tf.test.TestCase):
tape.watch(x)
y = block(x, training=True)
# Compute grads from reconstruction
- dx, dw, vars_ = block.backward_grads_and_vars(x, y, dy, training=True)
+ dx, dw = block.backward_grads(x, y, dy, training=True)
+ vars_ = block.trainable_variables
# Compute true grads
grads = tape.gradient(y, [x] + vars_, output_gradients=dy)
dx_true, dw_true = grads[0], grads[1:]
@@ -224,7 +225,8 @@ class RevBlockTest(tf.test.TestCase):
tape.watch(x)
y = block(x, training=True)
# Compute grads from reconstruction
- dx, dw, vars_ = block.backward_grads_and_vars(x, y, dy, training=True)
+ dx, dw = block.backward_grads(x, y, dy, training=True)
+ vars_ = block.trainable_variables
# Compute true grads
grads = tape.gradient(y, [x] + vars_, output_gradients=dy)
dx_true, dw_true = grads[0], grads[1:]
@@ -245,7 +247,7 @@ class _ResidualTest(tf.test.TestCase):
_validate_block_call_channels_first(blocks._Residual, self)
_validate_block_call_channels_last(blocks._Residual, self)
- def test_backward_grads_and_vars_channels_first(self):
+ def test_backward_grads_channels_first(self):
"""Test `backward_grads` function with `channels_first` data format."""
if not tf.test.is_gpu_available():
self.skipTest("GPU not available")
@@ -269,9 +271,8 @@ class _ResidualTest(tf.test.TestCase):
y = residual(x_true, training=True)
# Gradients computed due to reversibility
- x, dx, dw, vars_ = residual.backward_grads_and_vars(
- y, dy=dy, training=True)
-
+ x, dx, dw = residual.backward_grads(y, dy=dy, training=True)
+ vars_ = residual.trainable_variables
# True gradients computed by the tape
grads = tape.gradient(y, [x_true] + vars_, output_gradients=dy)
dx_true, dw_true = grads[0], grads[1:]
diff --git a/tensorflow/contrib/eager/python/examples/revnet/cifar_input.py b/tensorflow/contrib/eager/python/examples/revnet/cifar_input.py
index b6d4c35bfd..e9672f13e1 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/cifar_input.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/cifar_input.py
@@ -111,6 +111,6 @@ def get_ds_from_tfrecords(data_dir,
}[split]
dataset = dataset.shuffle(size)
- dataset = dataset.batch(batch_size)
+ dataset = dataset.batch(batch_size, drop_remainder=True)
return dataset
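
The `drop_remainder=True` change gives the batch dimension a static size, which the defun/XLA paths elsewhere in this change need; the trade-off is dropping the final partial batch. A tiny TF 1.x sketch of the shape difference:

    import tensorflow as tf

    ds = tf.data.Dataset.range(10)
    print(ds.batch(3).output_shapes)                       # (?,)  dynamic
    print(ds.batch(3, drop_remainder=True).output_shapes)  # (3,)  static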
diff --git a/tensorflow/contrib/eager/python/examples/revnet/config.py b/tensorflow/contrib/eager/python/examples/revnet/config.py
index 3d93fa955a..821a4878c1 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/config.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/config.py
@@ -27,17 +27,17 @@ from __future__ import division
from __future__ import print_function
import tensorflow as tf
-tfe = tf.contrib.eager
def get_hparams_cifar_38():
"""RevNet-38 configurations for CIFAR-10/CIFAR-100."""
config = tf.contrib.training.HParams()
+ config.add_hparam("num_train_images", 50000)
+ config.add_hparam("num_eval_images", 10000)
config.add_hparam("init_filters", 32)
config.add_hparam("init_kernel", 3)
config.add_hparam("init_stride", 1)
- config.add_hparam("n_classes", 10)
config.add_hparam("n_rev_blocks", 3)
config.add_hparam("n_res", [3, 3, 3])
config.add_hparam("filters", [32, 64, 112])
@@ -46,7 +46,7 @@ def get_hparams_cifar_38():
config.add_hparam("bottleneck", False)
config.add_hparam("fused", True)
config.add_hparam("init_max_pool", False)
- if tfe.num_gpus() > 0:
+ if tf.test.is_gpu_available():
config.add_hparam("input_shape", (3, 32, 32))
config.add_hparam("data_format", "channels_first")
else:
@@ -68,9 +68,21 @@ def get_hparams_cifar_38():
config.add_hparam("div255", True)
# This is imprecise; when training with the validation set,
# we only have 40k images in the training data
- config.add_hparam("iters_per_epoch", 50000 // config.batch_size)
+ config.add_hparam("iters_per_epoch",
+ config.num_train_images // config.batch_size)
config.add_hparam("epochs", config.max_train_iter // config.iters_per_epoch)
+ # Customized TPU hyperparameters, since the TPU architecture calls for a
+ # batch size different from the GPU/CPU one
+ # Suggested batch sizes to reduce overhead from excessive tensor padding
+ # https://cloud.google.com/tpu/docs/troubleshooting
+ config.add_hparam("tpu_batch_size", 1024)
+ config.add_hparam("tpu_eval_batch_size", 1024)
+ config.add_hparam("tpu_iters_per_epoch",
+ config.num_train_images // config.tpu_batch_size)
+ config.add_hparam("tpu_epochs",
+ config.max_train_iter // config.tpu_iters_per_epoch)
+
return config
@@ -98,15 +110,18 @@ def get_hparams_imagenet_56():
"""RevNet-56 configurations for ImageNet."""
config = tf.contrib.training.HParams()
+ config.add_hparam("n_classes", 1000)
+ config.add_hparam("dataset", "ImageNet")
+ config.add_hparam("num_train_images", 1281167)
+ config.add_hparam("num_eval_images", 50000)
config.add_hparam("init_filters", 128)
config.add_hparam("init_kernel", 7)
config.add_hparam("init_stride", 2)
- config.add_hparam("n_classes", 1000)
config.add_hparam("n_rev_blocks", 4)
config.add_hparam("n_res", [2, 2, 2, 2])
config.add_hparam("filters", [128, 256, 512, 832])
config.add_hparam("strides", [1, 2, 2, 2])
- config.add_hparam("batch_size", 16)
+ config.add_hparam("batch_size", 256)
config.add_hparam("bottleneck", True)
config.add_hparam("fused", True)
config.add_hparam("init_max_pool", True)
@@ -116,6 +131,9 @@ def get_hparams_imagenet_56():
else:
config.add_hparam("input_shape", (224, 224, 3))
config.add_hparam("data_format", "channels_last")
+ # Due to bottleneck residual blocks
+ filters = [f * 4 for f in config.filters]
+ config.filters = filters
# Training details
config.add_hparam("weight_decay", 1e-4)
@@ -125,16 +143,31 @@ def get_hparams_imagenet_56():
config.add_hparam("max_train_iter", 600000)
config.add_hparam("seed", 1234)
config.add_hparam("shuffle", True)
- config.add_hparam("log_every", 50)
- config.add_hparam("save_every", 50)
+ config.add_hparam("log_every", 500)
+ config.add_hparam("save_every", 500)
config.add_hparam("dtype", tf.float32)
- config.add_hparam("eval_batch_size", 1000)
+ config.add_hparam("eval_batch_size", 256)
config.add_hparam("div255", True)
- # TODO(lxuechen): Update this according to ImageNet data
- config.add_hparam("iters_per_epoch", 50000 // config.batch_size)
+ config.add_hparam("iters_per_epoch",
+ config.num_train_images // config.batch_size)
config.add_hparam("epochs", config.max_train_iter // config.iters_per_epoch)
- # Due to bottleneck residual blocks
- filters = [f * 4 for f in config.filters]
- config.filters = filters
+
+ # Customized TPU hyperparameters, since the TPU architecture calls for a
+ # batch size different from the GPU/CPU one
+ # Suggested batch sizes to reduce overhead from excessive tensor padding
+ # https://cloud.google.com/tpu/docs/troubleshooting
+ config.add_hparam("tpu_batch_size", 1024)
+ config.add_hparam("tpu_eval_batch_size", 1024)
+ config.add_hparam("tpu_iters_per_epoch",
+ config.num_train_images // config.tpu_batch_size)
+ config.add_hparam("tpu_epochs",
+ config.max_train_iter // config.tpu_iters_per_epoch)
+
+ return config
+
+
+def get_hparams_imagenet_104():
+ """RevNet-104 configurations for ImageNet."""
+ config = get_hparams_imagenet_56()
+ config.n_res = [2, 2, 11, 2]
return config
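
Worked numbers for the derived hyperparameters above, using the CIFAR-10 counts from this change; `batch_size` and `max_train_iter` are hypothetical here, since their defaults are set elsewhere in config.py:

    num_train_images = 50000   # per the hparam above
    tpu_batch_size = 1024      # per the hparam above
    batch_size = 128           # hypothetical
    max_train_iter = 80000     # hypothetical

    iters_per_epoch = num_train_images // batch_size          # 390
    epochs = max_train_iter // iters_per_epoch                # 205
    tpu_iters_per_epoch = num_train_images // tpu_batch_size  # 48
    tpu_epochs = max_train_iter // tpu_iters_per_epoch        # 1666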
diff --git a/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py b/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py
new file mode 100644
index 0000000000..e81351b1b1
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py
@@ -0,0 +1,230 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Efficient ImageNet input pipeline using tf.data.Dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import tensorflow as tf
+
+from tensorflow.contrib.eager.python.examples.revnet import resnet_preprocessing
+
+
+def image_serving_input_fn():
+ """Serving input fn for raw images."""
+
+ def _preprocess_image(image_bytes):
+ """Preprocess a single raw image."""
+ image = resnet_preprocessing.preprocess_image(
+ image_bytes=image_bytes, is_training=False)
+ return image
+
+ image_bytes_list = tf.placeholder(
+ shape=[None],
+ dtype=tf.string,
+ )
+ images = tf.map_fn(
+ _preprocess_image, image_bytes_list, back_prop=False, dtype=tf.float32)
+ return tf.estimator.export.ServingInputReceiver(
+ images, {'image_bytes': image_bytes_list})
+
+
+class ImageNetInput(object):
+ """Generates ImageNet input_fn for training or evaluation.
+
+ The training data is assumed to be in TFRecord format with keys as specified
+ in the dataset_parser below, sharded across 1024 files, named sequentially:
+ train-00000-of-01024
+ train-00001-of-01024
+ ...
+ train-01023-of-01024
+
+ The validation data is in the same format but sharded in 128 files.
+
+ The format of the data required is created by the script at:
+ https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py
+
+ Args:
+ is_training: `bool` for whether the input is for training
+ data_dir: `str` for the directory of the training and validation data;
+ if 'null' (the literal string 'null', not None), then construct a null
+ pipeline, consisting of empty images.
+ use_bfloat16: If True, use bfloat16 precision; else use float32.
+ transpose_input: 'bool' for whether to use the double transpose trick
+ num_cores: `int` for the number of TPU cores
+ """
+
+ def __init__(self, is_training,
+ use_bfloat16,
+ data_dir,
+ num_cores=8,
+ num_parallel_calls=64,
+ image_size=224,
+ transpose_input=False,
+ cache=False):
+ self.image_preprocessing_fn = resnet_preprocessing.preprocess_image
+ self.is_training = is_training
+ self.use_bfloat16 = use_bfloat16
+ self.data_dir = data_dir
+ self.num_cores = num_cores
+ self.num_parallel_calls = num_parallel_calls
+ if self.data_dir == 'null' or self.data_dir == '':
+ self.data_dir = None
+ self.transpose_input = transpose_input
+ self.image_size = image_size
+ self.cache = cache
+
+ def set_shapes(self, batch_size, images, labels):
+ """Statically set the batch_size dimension."""
+ if self.transpose_input:
+ images.set_shape(images.get_shape().merge_with(
+ tf.TensorShape([None, None, None, batch_size])))
+ labels.set_shape(labels.get_shape().merge_with(
+ tf.TensorShape([batch_size])))
+ else:
+ images.set_shape(images.get_shape().merge_with(
+ tf.TensorShape([batch_size, None, None, None])))
+ labels.set_shape(labels.get_shape().merge_with(
+ tf.TensorShape([batch_size])))
+
+ return images, labels
+
+ def dataset_parser(self, value):
+ """Parse an ImageNet record from a serialized string Tensor."""
+ keys_to_features = {
+ 'image/encoded': tf.FixedLenFeature((), tf.string, ''),
+ 'image/format': tf.FixedLenFeature((), tf.string, 'jpeg'),
+ 'image/class/label': tf.FixedLenFeature([], tf.int64, -1),
+ 'image/class/text': tf.FixedLenFeature([], tf.string, ''),
+ 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
+ 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
+ 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
+ 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
+ 'image/object/class/label': tf.VarLenFeature(dtype=tf.int64),
+ }
+
+ parsed = tf.parse_single_example(value, keys_to_features)
+ image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
+
+ image = self.image_preprocessing_fn(
+ image_bytes=image_bytes,
+ is_training=self.is_training,
+ image_size=self.image_size,
+ use_bfloat16=self.use_bfloat16)
+
+ # Subtract one so that labels are in [0, 1000).
+ label = tf.cast(
+ tf.reshape(parsed['image/class/label'], shape=[]), dtype=tf.int32) - 1
+
+ return image, label
+
+ def input_fn(self, params):
+ """Input function which provides a single batch for train or eval.
+
+ Args:
+ params: `dict` of parameters passed from the `TPUEstimator`.
+ `params['batch_size']` is always provided and should be used as the
+ effective batch size.
+
+ Returns:
+ A `tf.data.Dataset` object.
+ """
+ if self.data_dir is None:
+ tf.logging.info('Using fake input.')
+ return self.input_fn_null(params)
+
+ # Retrieves the batch size for the current shard. The # of shards is
+ # computed according to the input pipeline deployment. See
+ # tf.contrib.tpu.RunConfig for details.
+ batch_size = params['batch_size']
+
+ # Shuffle the filenames to ensure better randomization.
+ file_pattern = os.path.join(
+ self.data_dir, 'train-*' if self.is_training else 'validation-*')
+ dataset = tf.data.Dataset.list_files(file_pattern, shuffle=self.is_training)
+
+ if self.is_training and not self.cache:
+ dataset = dataset.repeat()
+
+ def fetch_dataset(filename):
+ buffer_size = 8 * 1024 * 1024 # 8 MiB per file
+ dataset = tf.data.TFRecordDataset(filename, buffer_size=buffer_size)
+ return dataset
+
+ # Read the data from disk in parallel
+ dataset = dataset.apply(
+ tf.contrib.data.parallel_interleave(
+ fetch_dataset, cycle_length=self.num_parallel_calls, sloppy=True))
+ if self.cache:
+ dataset = dataset.cache().apply(
+ tf.contrib.data.shuffle_and_repeat(1024 * 16))
+ else:
+ dataset = dataset.shuffle(1024)
+
+ # Use the fused map-and-batch operation.
+ #
+ # For XLA, we must use fixed shapes. Because we repeat the source training
+ # dataset indefinitely, we can use `drop_remainder=True` to get fixed-size
+ # batches without dropping any training examples.
+ #
+ # When evaluating, `drop_remainder=True` prevents accidentally evaluating
+ # the same image twice by dropping the final batch if it is less than a full
+ # batch size. As long as this validation is done with consistent batch size,
+ # exactly the same images will be used.
+ dataset = dataset.apply(
+ tf.contrib.data.map_and_batch(
+ self.dataset_parser, batch_size=batch_size,
+ num_parallel_batches=self.num_cores, drop_remainder=True))
+
+ # Transpose for performance on TPU
+ if self.transpose_input:
+ dataset = dataset.map(
+ lambda images, labels: (tf.transpose(images, [1, 2, 3, 0]), labels),
+ num_parallel_calls=self.num_cores)
+
+ # Assign static batch size dimension
+ dataset = dataset.map(functools.partial(self.set_shapes, batch_size))
+
+ # Prefetch overlaps in-feed with training
+ dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
+ return dataset
+
+ def input_fn_null(self, params):
+ """Input function which provides null (black) images."""
+ batch_size = params['batch_size']
+ dataset = tf.data.Dataset.range(1).repeat().map(self._get_null_input)
+ dataset = dataset.prefetch(batch_size)
+
+ dataset = dataset.apply(
+ tf.contrib.data.batch_and_drop_remainder(batch_size))
+ if self.transpose_input:
+ dataset = dataset.map(
+ lambda images, labels: (tf.transpose(images, [1, 2, 3, 0]), labels),
+ num_parallel_calls=8)
+
+ dataset = dataset.map(functools.partial(self.set_shapes, batch_size))
+
+ dataset = dataset.prefetch(32) # Prefetch overlaps in-feed with training
+ tf.logging.info('Input dataset: %s', str(dataset))
+ return dataset
+
+ def _get_null_input(self, _):
+ null_image = tf.zeros([224, 224, 3], tf.bfloat16
+ if self.use_bfloat16 else tf.float32)
+ return (null_image, tf.constant(0, tf.int32))
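
The `transpose_input` branch above is the host half of the double-transpose trick: batches leave the host as HWCN so the batch dimension comes last (which tends to be friendlier to TPU infeed), and the model undoes it on the device. A shape-only NumPy sketch:

    import numpy as np

    batch = np.zeros([64, 224, 224, 3])           # NHWC from map_and_batch
    host_out = np.transpose(batch, [1, 2, 3, 0])  # HWCN, as in the map() above
    print(host_out.shape)                         # (224, 224, 3, 64)

    # The device side would transpose back before the first conv
    device_in = np.transpose(host_out, [3, 0, 1, 2])
    assert device_in.shape == batch.shape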
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main.py b/tensorflow/contrib/eager/python/examples/revnet/main.py
index e2f43b03f9..b702e91f92 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/main.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/main.py
@@ -29,10 +29,18 @@ from tensorflow.contrib.eager.python.examples.revnet import revnet
tfe = tf.contrib.eager
+def apply_gradients(optimizer, grads, vars_, global_step=None):
+ """Functional style apply_grads for `tfe.defun`."""
+ optimizer.apply_gradients(zip(grads, vars_), global_step=global_step)
+
+
def main(_):
"""Eager execution workflow with RevNet trained on CIFAR-10."""
- config = get_config()
- ds_train, ds_train_one_shot, ds_validation, ds_test = get_datasets(config)
+ tf.enable_eager_execution()
+
+ config = get_config(config_name=FLAGS.config, dataset=FLAGS.dataset)
+ ds_train, ds_train_one_shot, ds_validation, ds_test = get_datasets(
+ data_dir=FLAGS.data_dir, config=config)
model = revnet.RevNet(config=config)
global_step = tf.train.get_or_create_global_step() # Ensure correct summary
global_step.assign(1)
@@ -43,6 +51,14 @@ def main(_):
checkpointer = tf.train.Checkpoint(
optimizer=optimizer, model=model, optimizer_step=global_step)
+ if FLAGS.use_defun:
+ model.call = tfe.defun(model.call)
+ model.compute_gradients = tfe.defun(model.compute_gradients)
+ model.get_moving_stats = tfe.defun(model.get_moving_stats)
+ model.restore_moving_stats = tfe.defun(model.restore_moving_stats)
+ global apply_gradients # pylint:disable=global-variable-undefined
+ apply_gradients = tfe.defun(apply_gradients)
+
if FLAGS.train_dir:
summary_writer = tf.contrib.summary.create_file_writer(FLAGS.train_dir)
if FLAGS.restore:
@@ -52,46 +68,37 @@ def main(_):
"with global_step: {}".format(latest_path, global_step.numpy()))
sys.stdout.flush()
- if FLAGS.manual_grad:
- print("Using manual gradients.")
- else:
- print("Not using manual gradients.")
- sys.stdout.flush()
-
for x, y in ds_train:
train_one_iter(model, x, y, optimizer, global_step=global_step)
if global_step.numpy() % config.log_every == 0:
- it_train = ds_train_one_shot.make_one_shot_iterator()
it_test = ds_test.make_one_shot_iterator()
- acc_train, loss_train = evaluate(model, it_train)
acc_test, loss_test = evaluate(model, it_test)
if FLAGS.validate:
+ it_train = ds_train_one_shot.make_one_shot_iterator()
it_validation = ds_validation.make_one_shot_iterator()
+ acc_train, loss_train = evaluate(model, it_train)
acc_validation, loss_validation = evaluate(model, it_validation)
print("Iter {}, "
"training set accuracy {:.4f}, loss {:.4f}; "
- "validation set accuracy {:.4f}, loss {:4.f}"
+ "validation set accuracy {:.4f}, loss {:.4f}; "
"test accuracy {:.4f}, loss {:.4f}".format(
global_step.numpy(), acc_train, loss_train, acc_validation,
loss_validation, acc_test, loss_test))
else:
- print("Iter {}, "
- "training set accuracy {:.4f}, loss {:.4f}; "
- "test accuracy {:.4f}, loss {:.4f}".format(
- global_step.numpy(), acc_train, loss_train, acc_test,
- loss_test))
+ print("Iter {}, test accuracy {:.4f}, loss {:.4f}".format(
+ global_step.numpy(), acc_test, loss_test))
sys.stdout.flush()
if FLAGS.train_dir:
with summary_writer.as_default():
with tf.contrib.summary.always_record_summaries():
- tf.contrib.summary.scalar("Training accuracy", acc_train)
tf.contrib.summary.scalar("Test accuracy", acc_test)
- tf.contrib.summary.scalar("Training loss", loss_train)
tf.contrib.summary.scalar("Test loss", loss_test)
if FLAGS.validate:
+ tf.contrib.summary.scalar("Training accuracy", acc_train)
+ tf.contrib.summary.scalar("Training loss", loss_train)
tf.contrib.summary.scalar("Validation accuracy", acc_validation)
tf.contrib.summary.scalar("Validation loss", loss_validation)
@@ -103,34 +110,38 @@ def main(_):
sys.stdout.flush()
-def get_config():
+def get_config(config_name="revnet-38", dataset="cifar-10"):
"""Return configuration."""
- print("Config: {}".format(FLAGS.config))
+ print("Config: {}".format(config_name))
sys.stdout.flush()
config = {
"revnet-38": config_.get_hparams_cifar_38(),
"revnet-110": config_.get_hparams_cifar_110(),
"revnet-164": config_.get_hparams_cifar_164(),
- }[FLAGS.config]
+ }[config_name]
- if FLAGS.dataset == "cifar-100":
- config.n_classes = 100
+ if dataset == "cifar-10":
+ config.add_hparam("n_classes", 10)
+ config.add_hparam("dataset", "cifar-10")
+ else:
+ config.add_hparam("n_classes", 100)
+ config.add_hparam("dataset", "cifar-100")
return config
-def get_datasets(config):
+def get_datasets(data_dir, config):
"""Return dataset."""
- if FLAGS.data_dir is None:
+ if data_dir is None:
raise ValueError("No supplied data directory")
- if not os.path.exists(FLAGS.data_dir):
- raise ValueError("Data directory {} does not exist".format(FLAGS.data_dir))
- if FLAGS.dataset not in ["cifar-10", "cifar-100"]:
- raise ValueError("Unknown dataset {}".format(FLAGS.dataset))
+ if not os.path.exists(data_dir):
+ raise ValueError("Data directory {} does not exist".format(data_dir))
+ if config.dataset not in ["cifar-10", "cifar-100"]:
+ raise ValueError("Unknown dataset {}".format(config.dataset))
- print("Training on {} dataset.".format(FLAGS.dataset))
+ print("Training on {} dataset.".format(config.dataset))
sys.stdout.flush()
- data_dir = os.path.join(FLAGS.data_dir, FLAGS.dataset)
+ data_dir = os.path.join(data_dir, config.dataset)
if FLAGS.validate:
# 40k Training set
ds_train = cifar_input.get_ds_from_tfrecords(
@@ -168,7 +179,7 @@ def get_datasets(config):
prefetch=config.batch_size)
ds_validation = None
- # Always compute loss and accuracy on whole training and test set
+ # Always compute loss and accuracy on whole test set
ds_train_one_shot = cifar_input.get_ds_from_tfrecords(
data_dir=data_dir,
split="train_all",
@@ -196,19 +207,15 @@ def get_datasets(config):
def train_one_iter(model, inputs, labels, optimizer, global_step=None):
"""Train for one iteration."""
- if FLAGS.manual_grad:
- grads, vars_, loss = model.compute_gradients(inputs, labels, training=True)
- optimizer.apply_gradients(zip(grads, vars_), global_step=global_step)
- else: # For correctness validation
- with tf.GradientTape() as tape:
- logits, _ = model(inputs, training=True)
- loss = model.compute_loss(logits=logits, labels=labels)
- tf.logging.info("Logits are placed on device: {}".format(logits.device))
- grads = tape.gradient(loss, model.trainable_variables)
- optimizer.apply_gradients(
- zip(grads, model.trainable_variables), global_step=global_step)
+ logits, saved_hiddens = model(inputs, training=True)
+ values = model.get_moving_stats()
+ grads, loss = model.compute_gradients(saved_hiddens, labels)
+ # Restore moving averages when executing eagerly to avoid updating twice
+ model.restore_moving_stats(values)
+ apply_gradients(
+ optimizer, grads, model.trainable_variables, global_step=global_step)
- return loss.numpy()
+ return logits, loss
def evaluate(model, iterator):
@@ -241,16 +248,18 @@ if __name__ == "__main__":
"validate",
default=False,
help="[Optional] Use the validation set or not for hyperparameter search")
- flags.DEFINE_boolean(
- "manual_grad",
- default=False,
- help="[Optional] Use manual gradient graph to save memory")
flags.DEFINE_string(
"dataset",
default="cifar-10",
help="[Optional] The dataset used; either `cifar-10` or `cifar-100`")
flags.DEFINE_string(
- "config", default="revnet-38", help="[Optional] Architecture of network.")
+ "config",
+ default="revnet-38",
+ help="[Optional] Architecture of network. "
+ "Other options include `revnet-110` and `revnet-164`")
+ flags.DEFINE_boolean(
+ "use_defun",
+ default=False,
+ help="[Optional] Use `tfe.defun` to boost performance.")
FLAGS = flags.FLAGS
- tf.enable_eager_execution()
tf.app.run(main)
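
A minimal sketch of the `tfe.defun` pattern this file now uses (TF 1.x eager; the toy function and shapes are made up): wrapping a Python function traces it into a graph function on first call, and the trace is reused for matching input shapes:

    import tensorflow as tf

    tf.enable_eager_execution()
    tfe = tf.contrib.eager

    def dense_relu(x, w):
      return tf.nn.relu(tf.matmul(x, w))

    dense_relu_fn = tfe.defun(dense_relu)  # graph-compiled, same call signature

    x = tf.random_normal([8, 4])
    w = tf.random_normal([4, 4])
    print(dense_relu_fn(x, w).shape)  # traced once, then reused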
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py b/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py
new file mode 100644
index 0000000000..a21b573c06
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py
@@ -0,0 +1,200 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Estimator workflow with RevNet train on CIFAR-10."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from absl import flags
+import tensorflow as tf
+from tensorflow.contrib.eager.python.examples.revnet import cifar_input
+from tensorflow.contrib.eager.python.examples.revnet import main as main_
+from tensorflow.contrib.eager.python.examples.revnet import revnet
+
+
+def model_fn(features, labels, mode, params):
+ """Function specifying the model that is required by the `tf.estimator` API.
+
+ Args:
+ features: Input images
+ labels: Labels of images
+    mode: One of `ModeKeys.TRAIN`, `ModeKeys.EVAL` or `ModeKeys.PREDICT`
+    params: A dictionary of extra parameters that might be passed
+
+ Returns:
+ An instance of `tf.estimator.EstimatorSpec`
+ """
+
+ inputs = features
+ if isinstance(inputs, dict):
+ inputs = features["image"]
+
+ config = params["config"]
+ model = revnet.RevNet(config=config)
+
+ if mode == tf.estimator.ModeKeys.TRAIN:
+ global_step = tf.train.get_or_create_global_step()
+ learning_rate = tf.train.piecewise_constant(
+ global_step, config.lr_decay_steps, config.lr_list)
+ optimizer = tf.train.MomentumOptimizer(
+ learning_rate, momentum=config.momentum)
+ logits, saved_hidden = model(inputs, training=True)
+ grads, loss = model.compute_gradients(saved_hidden, labels, training=True)
+ with tf.control_dependencies(model.get_updates_for(inputs)):
+ train_op = optimizer.apply_gradients(
+ zip(grads, model.trainable_variables), global_step=global_step)
+
+ return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
+ else:
+ logits, _ = model(inputs, training=False)
+ predictions = tf.argmax(logits, axis=1)
+ probabilities = tf.nn.softmax(logits)
+
+ if mode == tf.estimator.ModeKeys.EVAL:
+ loss = model.compute_loss(labels=labels, logits=logits)
+ return tf.estimator.EstimatorSpec(
+ mode=mode,
+ loss=loss,
+ eval_metric_ops={
+ "accuracy":
+ tf.metrics.accuracy(labels=labels, predictions=predictions)
+ })
+
+ else: # mode == tf.estimator.ModeKeys.PREDICT
+ result = {
+ "classes": predictions,
+ "probabilities": probabilities,
+ }
+
+ return tf.estimator.EstimatorSpec(
+ mode=mode,
+ predictions=predictions,
+ export_outputs={
+ "classify": tf.estimator.export.PredictOutput(result)
+ })
+
+
+def get_input_fn(config, data_dir, split):
+ """Get the input function that is required by the `tf.estimator` API.
+
+ Args:
+ config: Customized hyperparameters
+ data_dir: Directory where the data is stored
+ split: One of `train`, `validation`, `train_all`, and `test`
+
+ Returns:
+ Input function required by the `tf.estimator` API
+ """
+
+ data_dir = os.path.join(data_dir, config.dataset)
+ # Fix split-dependent hyperparameters
+ if split == "train_all" or split == "train":
+ data_aug = True
+ batch_size = config.batch_size
+ epochs = config.epochs
+ shuffle = True
+ prefetch = config.batch_size
+ else:
+ data_aug = False
+ batch_size = config.eval_batch_size
+ epochs = 1
+ shuffle = False
+ prefetch = config.eval_batch_size
+
+ def input_fn():
+ """Input function required by the `tf.estimator.Estimator` API."""
+ return cifar_input.get_ds_from_tfrecords(
+ data_dir=data_dir,
+ split=split,
+ data_aug=data_aug,
+ batch_size=batch_size,
+ epochs=epochs,
+ shuffle=shuffle,
+ prefetch=prefetch,
+ data_format=config.data_format)
+
+ return input_fn
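
A hedged usage sketch of `get_input_fn`: build an input function and pull one batch to sanity-check shapes. The data directory and `config` below are placeholders, not values from this change.

```python
# Placeholder config/data_dir, purely for illustration.
train_input_fn = get_input_fn(
    config=config, data_dir="/tmp/cifar_tfrecords", split="train")
dataset = train_input_fn()  # a tf.data.Dataset of (image, label) batches
images, labels = dataset.make_one_shot_iterator().get_next()
```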
+
+
+def main(_):
+ tf.logging.set_verbosity(tf.logging.INFO)
+
+ # RevNet specific configuration
+ config = main_.get_config(config_name=FLAGS.config, dataset=FLAGS.dataset)
+
+ # Estimator specific configuration
+ run_config = tf.estimator.RunConfig(
+ model_dir=FLAGS.train_dir, # Directory for storing checkpoints
+ tf_random_seed=config.seed,
+ save_summary_steps=config.log_every,
+ save_checkpoints_steps=config.log_every,
+ session_config=None, # Using default
+ keep_checkpoint_max=100,
+ keep_checkpoint_every_n_hours=10000, # Using default
+ log_step_count_steps=config.log_every,
+      train_distribute=None  # By default, do not use a distribution strategy
+ )
+
+ # Construct estimator
+ revnet_estimator = tf.estimator.Estimator(
+ model_fn=model_fn,
+ model_dir=FLAGS.train_dir,
+ config=run_config,
+ params={"config": config})
+
+ # Construct input functions
+ train_input_fn = get_input_fn(
+ config=config, data_dir=FLAGS.data_dir, split="train_all")
+ eval_input_fn = get_input_fn(
+ config=config, data_dir=FLAGS.data_dir, split="test")
+
+ # Train and evaluate estimator
+ revnet_estimator.train(input_fn=train_input_fn)
+ revnet_estimator.evaluate(input_fn=eval_input_fn)
+
+ if FLAGS.export:
+ input_shape = (None,) + config.input_shape
+ inputs = tf.placeholder(tf.float32, shape=input_shape)
+ input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
+ "image": inputs
+ })
+ revnet_estimator.export_savedmodel(FLAGS.train_dir, input_fn)
+
+
+if __name__ == "__main__":
+ flags.DEFINE_string(
+ "data_dir", default=None, help="Directory to load tfrecords")
+ flags.DEFINE_string(
+ "train_dir",
+ default=None,
+ help="[Optional] Directory to store the training information")
+ flags.DEFINE_string(
+ "dataset",
+ default="cifar-10",
+ help="[Optional] The dataset used; either `cifar-10` or `cifar-100`")
+ flags.DEFINE_boolean(
+ "export",
+ default=False,
+ help="[Optional] Export the model for serving if True")
+ flags.DEFINE_string(
+ "config",
+ default="revnet-38",
+ help="[Optional] Architecture of network. "
+ "Other options include `revnet-110` and `revnet-164`")
+ FLAGS = flags.FLAGS
+ tf.app.run()
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py b/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py
new file mode 100644
index 0000000000..f0aad9b110
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py
@@ -0,0 +1,319 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Cloud TPU Estimator workflow with RevNet train on CIFAR-10."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+
+from absl import flags
+import tensorflow as tf
+from tensorflow.contrib.eager.python.examples.revnet import cifar_input
+from tensorflow.contrib.eager.python.examples.revnet import main as main_
+from tensorflow.contrib.eager.python.examples.revnet import revnet
+from tensorflow.contrib.training.python.training import evaluation
+from tensorflow.python.estimator import estimator as estimator_
+
+
+def model_fn(features, labels, mode, params):
+ """Model function required by the `tf.contrib.tpu.TPUEstimator` API.
+
+ Args:
+ features: Input images
+ labels: Labels of images
+    mode: One of `ModeKeys.TRAIN`, `ModeKeys.EVAL` or `ModeKeys.PREDICT`
+    params: A dictionary of extra parameters that might be passed
+
+ Returns:
+ An instance of `tf.contrib.tpu.TPUEstimatorSpec`
+ """
+
+ inputs = features
+ if isinstance(inputs, dict):
+ inputs = features["image"]
+
+ config = params["config"]
+ model = revnet.RevNet(config=config)
+
+ if mode == tf.estimator.ModeKeys.TRAIN:
+ global_step = tf.train.get_or_create_global_step()
+ learning_rate = tf.train.piecewise_constant(
+ global_step, config.lr_decay_steps, config.lr_list)
+ optimizer = tf.train.MomentumOptimizer(
+ learning_rate, momentum=config.momentum)
+
+ if FLAGS.use_tpu:
+ optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+ logits, saved_hidden = model(inputs, training=True)
+ grads, loss = model.compute_gradients(saved_hidden, labels, training=True)
+ train_op = optimizer.apply_gradients(
+ zip(grads, model.trainable_variables), global_step=global_step)
+
+ return tf.contrib.tpu.TPUEstimatorSpec(
+ mode=tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
+
+ elif mode == tf.estimator.ModeKeys.EVAL:
+ logits, _ = model(inputs, training=False)
+ loss = model.compute_loss(labels=labels, logits=logits)
+
+ def metric_fn(labels, logits):
+ predictions = tf.argmax(logits, axis=1)
+ accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)
+ return {
+ "accuracy": accuracy,
+ }
+
+ return tf.contrib.tpu.TPUEstimatorSpec(
+ mode=mode, loss=loss, eval_metrics=(metric_fn, [labels, logits]))
+
+ else: # Predict or export
+ logits, _ = model(inputs, training=False)
+ predictions = {
+ "classes": tf.argmax(logits, axis=1),
+ "probabilities": tf.nn.softmax(logits),
+ }
+
+ return tf.contrib.tpu.TPUEstimatorSpec(
+ mode=mode,
+ predictions=predictions,
+ export_outputs={
+ "classify": tf.estimator.export.PredictOutput(predictions)
+ })
+
+
+def get_input_fn(config, data_dir, split):
+ """Get the input function required by the `tf.contrib.tpu.TPUEstimator` API.
+
+ Args:
+ config: Customized hyperparameters
+ data_dir: Directory where the data is stored
+ split: One of `train`, `validation`, `train_all`, and `test`
+
+ Returns:
+ Input function required by the `tf.contrib.tpu.TPUEstimator` API
+ """
+
+ data_dir = os.path.join(data_dir, config.dataset)
+ # Fix split-dependent hyperparameters
+ if split == "train_all" or split == "train":
+ data_aug = True
+ epochs = config.tpu_epochs
+ shuffle = True
+ else:
+ data_aug = False
+ epochs = 1
+ shuffle = False
+
+ def input_fn(params):
+ """Input function required by the `tf.contrib.tpu.TPUEstimator` API."""
+ batch_size = params["batch_size"]
+ return cifar_input.get_ds_from_tfrecords(
+ data_dir=data_dir,
+ split=split,
+ data_aug=data_aug,
+ batch_size=batch_size, # per-shard batch size
+ epochs=epochs,
+ shuffle=shuffle,
+ prefetch=batch_size, # per-shard batch size
+ data_format=config.data_format)
+
+ return input_fn
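
Note that `TPUEstimator` injects the per-shard batch size into `params` before calling `input_fn`, which is why the function above reads `params["batch_size"]` rather than a config value. A small arithmetic illustration with assumed numbers:

```python
# Assumed values, for illustration only.
train_batch_size = 1024   # global batch passed to TPUEstimator
num_shards = 8            # TPU cores
per_shard_batch = train_batch_size // num_shards  # 128
# input_fn then sees params["batch_size"] == 128 on each shard.
```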
+
+
+def main(_):
+ tf.logging.set_verbosity(tf.logging.INFO)
+
+ # RevNet specific configuration
+ config = main_.get_config(config_name=FLAGS.config, dataset=FLAGS.dataset)
+
+ if FLAGS.use_tpu:
+ tf.logging.info("Using TPU.")
+ tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+ FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+ else:
+ tpu_cluster_resolver = None
+
+ # TPU specific configuration
+ tpu_config = tf.contrib.tpu.TPUConfig(
+      # Recommended to be set to the number of global steps before the next
+      # checkpoint
+ iterations_per_loop=FLAGS.iterations_per_loop,
+ num_shards=FLAGS.num_shards)
+
+ # Estimator specific configuration
+ run_config = tf.contrib.tpu.RunConfig(
+ cluster=tpu_cluster_resolver,
+ model_dir=FLAGS.model_dir,
+ session_config=tf.ConfigProto(
+ allow_soft_placement=True, log_device_placement=False),
+ tpu_config=tpu_config,
+ )
+
+ # Construct TPU Estimator
+ estimator = tf.contrib.tpu.TPUEstimator(
+ model_fn=model_fn,
+ use_tpu=FLAGS.use_tpu,
+ train_batch_size=config.tpu_batch_size,
+ eval_batch_size=config.tpu_eval_batch_size,
+ config=run_config,
+ params={"config": config})
+
+ # Construct input functions
+ train_input_fn = get_input_fn(
+ config=config, data_dir=FLAGS.data_dir, split="train_all")
+ eval_input_fn = get_input_fn(
+ config=config, data_dir=FLAGS.data_dir, split="test")
+
+ # Disabling a range within an else block currently doesn't work
+ # due to https://github.com/PyCQA/pylint/issues/872
+ # pylint: disable=protected-access
+ if FLAGS.mode == "eval":
+ # TPUEstimator.evaluate *requires* a steps argument.
+ # Note that the number of examples used during evaluation is
+ # --eval_steps * --batch_size.
+ # So if you change --batch_size then change --eval_steps too.
+ eval_steps = 10000 // config.tpu_eval_batch_size
+
+ # Run evaluation when there's a new checkpoint
+ for ckpt in evaluation.checkpoints_iterator(
+ FLAGS.model_dir, timeout=FLAGS.eval_timeout):
+ tf.logging.info("Starting to evaluate.")
+ try:
+ start_timestamp = time.time() # This time will include compilation time
+ eval_results = estimator.evaluate(
+ input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt)
+ elapsed_time = int(time.time() - start_timestamp)
+ tf.logging.info("Eval results: %s. Elapsed seconds: %d" %
+ (eval_results, elapsed_time))
+
+ # Terminate eval job when final checkpoint is reached
+ current_step = int(os.path.basename(ckpt).split("-")[1])
+ if current_step >= config.max_train_iter:
+ tf.logging.info(
+ "Evaluation finished after training step %d" % current_step)
+ break
+
+ except tf.errors.NotFoundError:
+ # Since the coordinator is on a different job than the TPU worker,
+ # sometimes the TPU worker does not finish initializing until long after
+ # the CPU job tells it to start evaluating. In this case, the checkpoint
+ # file could have been deleted already.
+ tf.logging.info(
+ "Checkpoint %s no longer exists, skipping checkpoint" % ckpt)
+
+ else: # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
+ current_step = estimator_._load_global_step_from_checkpoint_dir(
+ FLAGS.model_dir)
+ tf.logging.info("Training for %d steps . Current"
+ " step %d." % (config.max_train_iter, current_step))
+
+ start_timestamp = time.time() # This time will include compilation time
+ if FLAGS.mode == "train":
+ estimator.train(input_fn=train_input_fn, max_steps=config.max_train_iter)
+ else:
+ eval_steps = 10000 // config.tpu_eval_batch_size
+ assert FLAGS.mode == "train_and_eval"
+ while current_step < config.max_train_iter:
+ # Train for up to steps_per_eval number of steps.
+ # At the end of training, a checkpoint will be written to --model_dir.
+ next_checkpoint = min(current_step + FLAGS.steps_per_eval,
+ config.max_train_iter)
+ estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)
+ current_step = next_checkpoint
+
+ # Evaluate the model on the most recent model in --model_dir.
+ # Since evaluation happens in batches of --eval_batch_size, some images
+ # may be consistently excluded modulo the batch size.
+ tf.logging.info("Starting to evaluate.")
+ eval_results = estimator.evaluate(
+ input_fn=eval_input_fn, steps=eval_steps)
+ tf.logging.info("Eval results: %s" % eval_results)
+
+ elapsed_time = int(time.time() - start_timestamp)
+ tf.logging.info("Finished training up to step %d. Elapsed seconds %d." %
+ (config.max_train_iter, elapsed_time))
+ # pylint: enable=protected-access
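
For concreteness, a worked example of the `eval_steps` arithmetic used above, with an assumed eval batch size (the CIFAR test split has 10000 images):

```python
tpu_eval_batch_size = 1024                           # assumed example value
eval_steps = 10000 // tpu_eval_batch_size            # 9 full batches
images_evaluated = eval_steps * tpu_eval_batch_size  # 9216; 784 images skipped
```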
+
+
+if __name__ == "__main__":
+ # Cloud TPU Cluster Resolver flags
+ flags.DEFINE_string(
+ "tpu",
+ default=None,
+ help="The Cloud TPU to use for training. This should be either the name "
+ "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
+ "url.")
+ flags.DEFINE_string(
+ "tpu_zone",
+ default=None,
+ help="[Optional] GCE zone where the Cloud TPU is located in. If not "
+ "specified, we will attempt to automatically detect the GCE project from "
+ "metadata.")
+ flags.DEFINE_string(
+ "gcp_project",
+ default=None,
+ help="[Optional] Project name for the Cloud TPU-enabled project. If not "
+ "specified, we will attempt to automatically detect the GCE project from "
+ "metadata.")
+
+ # Model specific parameters
+ flags.DEFINE_string(
+ "data_dir", default=None, help="Directory to load tfrecords")
+ flags.DEFINE_string(
+ "model_dir",
+ default=None,
+ help="[Optional] Directory to store the model information")
+ flags.DEFINE_string(
+ "dataset",
+ default="cifar-10",
+ help="[Optional] The dataset used; either `cifar-10` or `cifar-100`")
+ flags.DEFINE_string(
+ "config",
+ default="revnet-38",
+ help="[Optional] Architecture of network. "
+ "Other options include `revnet-110` and `revnet-164`")
+ flags.DEFINE_boolean(
+ "use_tpu", default=True, help="[Optional] Whether to use TPU")
+ flags.DEFINE_integer(
+ "num_shards", default=8, help="Number of shards (TPU chips).")
+ flags.DEFINE_integer(
+ "iterations_per_loop",
+ default=100,
+ help=(
+ "Number of steps to run on TPU before feeding metrics to the CPU."
+ " If the number of iterations in the loop would exceed the number of"
+ " train steps, the loop will exit before reaching"
+ " --iterations_per_loop. The larger this value is, the higher the"
+ " utilization on the TPU."))
+ flags.DEFINE_string(
+ "mode",
+ default="train_and_eval",
+ help="[Optional] Mode to run: train, eval, train_and_eval")
+ flags.DEFINE_integer(
+ "eval_timeout", 60 * 60 * 24,
+ "Maximum seconds between checkpoints before evaluation terminates.")
+ flags.DEFINE_integer(
+ "steps_per_eval",
+ default=1000,
+ help=(
+ "Controls how often evaluation is performed. Since evaluation is"
+ " fairly expensive, it is advised to evaluate as infrequently as"
+ " possible (i.e. up to --train_steps, which evaluates the model only"
+ " after finishing the entire training regime)."))
+ FLAGS = flags.FLAGS
+ tf.app.run()
diff --git a/tensorflow/contrib/eager/python/examples/revnet/resnet_preprocessing.py b/tensorflow/contrib/eager/python/examples/revnet/resnet_preprocessing.py
new file mode 100644
index 0000000000..21a1ab85d4
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/revnet/resnet_preprocessing.py
@@ -0,0 +1,190 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ImageNet preprocessing for ResNet."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+IMAGE_SIZE = 224
+CROP_PADDING = 32
+
+
+def distorted_bounding_box_crop(image_bytes,
+ bbox,
+ min_object_covered=0.1,
+ aspect_ratio_range=(0.75, 1.33),
+ area_range=(0.05, 1.0),
+ max_attempts=100,
+ scope=None):
+ """Generates cropped_image using one of the bboxes randomly distorted.
+
+ See `tf.image.sample_distorted_bounding_box` for more documentation.
+
+ Args:
+ image_bytes: `Tensor` of binary image data.
+ bbox: `Tensor` of bounding boxes arranged `[1, num_boxes, coords]`
+ where each coordinate is [0, 1) and the coordinates are arranged
+ as `[ymin, xmin, ymax, xmax]`. If num_boxes is 0 then use the whole
+ image.
+ min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
+ area of the image must contain at least this fraction of any bounding
+ box supplied.
+ aspect_ratio_range: An optional list of `float`s. The cropped area of the
+ image must have an aspect ratio = width / height within this range.
+ area_range: An optional list of `float`s. The cropped area of the image
+      must contain a fraction of the supplied image within this range.
+ max_attempts: An optional `int`. Number of attempts at generating a cropped
+ region of the image of the specified constraints. After `max_attempts`
+ failures, return the entire image.
+ scope: Optional `str` for name scope.
+ Returns:
+ cropped image `Tensor`
+ """
+ with tf.name_scope(scope, 'distorted_bounding_box_crop', [image_bytes, bbox]):
+ shape = tf.image.extract_jpeg_shape(image_bytes)
+ sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
+ shape,
+ bounding_boxes=bbox,
+ min_object_covered=min_object_covered,
+ aspect_ratio_range=aspect_ratio_range,
+ area_range=area_range,
+ max_attempts=max_attempts,
+ use_image_if_no_bounding_boxes=True)
+ bbox_begin, bbox_size, _ = sample_distorted_bounding_box
+
+ # Crop the image to the specified bounding box.
+ offset_y, offset_x, _ = tf.unstack(bbox_begin)
+ target_height, target_width, _ = tf.unstack(bbox_size)
+ crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
+ image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
+
+ return image
+
+
+def _at_least_x_are_equal(a, b, x):
+ """At least `x` of `a` and `b` `Tensors` are equal."""
+ match = tf.equal(a, b)
+ match = tf.cast(match, tf.int32)
+ return tf.greater_equal(tf.reduce_sum(match), x)
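
In plain Python terms, the check above counts element-wise matches between two shape vectors; `_decode_and_random_crop` below uses it with `x = 3` to detect a degenerate crop. A rendering with assumed shapes, for illustration only:

```python
original_shape = (480, 640, 3)   # assumed input shape
cropped_shape = (480, 640, 3)    # the "random" crop kept the whole image
matches = sum(int(a == b) for a, b in zip(original_shape, cropped_shape))
bad = matches >= 3  # True: fall back to the deterministic center crop
```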
+
+
+def _decode_and_random_crop(image_bytes, image_size):
+ """Make a random crop of image_size."""
+ bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
+ image = distorted_bounding_box_crop(
+ image_bytes,
+ bbox,
+ min_object_covered=0.1,
+ aspect_ratio_range=(3. / 4, 4. / 3.),
+ area_range=(0.08, 1.0),
+ max_attempts=10,
+ scope=None)
+ original_shape = tf.image.extract_jpeg_shape(image_bytes)
+ bad = _at_least_x_are_equal(original_shape, tf.shape(image), 3)
+
+ image = tf.cond(
+ bad,
+ lambda: _decode_and_center_crop(image_bytes, image_size),
+ lambda: tf.image.resize_bicubic([image], # pylint: disable=g-long-lambda
+ [image_size, image_size])[0])
+
+ return image
+
+
+def _decode_and_center_crop(image_bytes, image_size):
+ """Crops to center of image with padding then scales image_size."""
+ shape = tf.image.extract_jpeg_shape(image_bytes)
+ image_height = shape[0]
+ image_width = shape[1]
+
+ padded_center_crop_size = tf.cast(
+ ((image_size / (image_size + CROP_PADDING)) *
+ tf.cast(tf.minimum(image_height, image_width), tf.float32)),
+ tf.int32)
+
+ offset_height = ((image_height - padded_center_crop_size) + 1) // 2
+ offset_width = ((image_width - padded_center_crop_size) + 1) // 2
+ crop_window = tf.stack([offset_height, offset_width,
+ padded_center_crop_size, padded_center_crop_size])
+ image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
+ image = tf.image.resize_bicubic([image], [image_size, image_size])[0]
+
+ return image
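
A worked example of the padded center-crop arithmetic, using the module defaults `IMAGE_SIZE = 224` and `CROP_PADDING = 32` on a hypothetical 640x480 input:

```python
image_size, crop_padding = 224, 32
image_height, image_width = 480, 640
# 224 / (224 + 32) = 0.875, so the crop covers 87.5% of the short side.
padded_center_crop_size = int(
    (image_size / (image_size + crop_padding)) *
    min(image_height, image_width))                            # 420
offset_height = ((image_height - padded_center_crop_size) + 1) // 2  # 30
offset_width = ((image_width - padded_center_crop_size) + 1) // 2    # 110
```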
+
+
+def _flip(image):
+ """Random horizontal image flip."""
+ image = tf.image.random_flip_left_right(image)
+ return image
+
+
+def preprocess_for_train(image_bytes, use_bfloat16, image_size=IMAGE_SIZE):
+ """Preprocesses the given image for evaluation.
+
+ Args:
+ image_bytes: `Tensor` representing an image binary of arbitrary size.
+ use_bfloat16: `bool` for whether to use bfloat16.
+ image_size: image size.
+
+ Returns:
+ A preprocessed image `Tensor`.
+ """
+ image = _decode_and_random_crop(image_bytes, image_size)
+ image = _flip(image)
+ image = tf.reshape(image, [image_size, image_size, 3])
+ image = tf.image.convert_image_dtype(
+ image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32)
+ return image
+
+
+def preprocess_for_eval(image_bytes, use_bfloat16, image_size=IMAGE_SIZE):
+ """Preprocesses the given image for evaluation.
+
+ Args:
+ image_bytes: `Tensor` representing an image binary of arbitrary size.
+ use_bfloat16: `bool` for whether to use bfloat16.
+ image_size: image size.
+
+ Returns:
+ A preprocessed image `Tensor`.
+ """
+ image = _decode_and_center_crop(image_bytes, image_size)
+ image = tf.reshape(image, [image_size, image_size, 3])
+ image = tf.image.convert_image_dtype(
+ image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32)
+ return image
+
+
+def preprocess_image(image_bytes,
+ is_training=False,
+ use_bfloat16=False,
+ image_size=IMAGE_SIZE):
+ """Preprocesses the given image.
+
+ Args:
+ image_bytes: `Tensor` representing an image binary of arbitrary size.
+ is_training: `bool` for whether the preprocessing is for training.
+ use_bfloat16: `bool` for whether to use bfloat16.
+ image_size: image size.
+
+ Returns:
+ A preprocessed image `Tensor`.
+ """
+ if is_training:
+ return preprocess_for_train(image_bytes, use_bfloat16, image_size)
+ else:
+ return preprocess_for_eval(image_bytes, use_bfloat16, image_size)
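
A minimal sketch of wiring `preprocess_image` into a `tf.data` pipeline; the TFRecord feature key `"image/encoded"` and the file name are assumptions for illustration, not part of this change.

```python
import tensorflow as tf

def _parse_and_preprocess(serialized):
  features = tf.parse_single_example(
      serialized, {"image/encoded": tf.FixedLenFeature([], tf.string)})
  # JPEG decoding happens inside preprocess_image via decode_and_crop_jpeg.
  return preprocess_image(features["image/encoded"], is_training=True)

dataset = tf.data.TFRecordDataset("train.tfrecord").map(_parse_and_preprocess)
```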
diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet.py b/tensorflow/contrib/eager/python/examples/revnet/revnet.py
index af0d20fa72..1f2cb14972 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/revnet.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/revnet.py
@@ -24,10 +24,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import functools
-import operator
-
-import six
import tensorflow as tf
from tensorflow.contrib.eager.python.examples.revnet import blocks
@@ -45,71 +41,10 @@ class RevNet(tf.keras.Model):
self.axis = 1 if config.data_format == "channels_first" else 3
self.config = config
- self._init_block = self._construct_init_block()
+ self._init_block = blocks.InitBlock(config=self.config)
+ self._final_block = blocks.FinalBlock(config=self.config)
self._block_list = self._construct_intermediate_blocks()
- self._final_block = self._construct_final_block()
-
- def _construct_init_block(self):
- init_block = tf.keras.Sequential(
- [
- tf.keras.layers.Conv2D(
- filters=self.config.init_filters,
- kernel_size=self.config.init_kernel,
- strides=(self.config.init_stride, self.config.init_stride),
- data_format=self.config.data_format,
- use_bias=False,
- padding="SAME",
- input_shape=self.config.input_shape,
- dtype=self.config.dtype),
- tf.keras.layers.BatchNormalization(
- axis=self.axis,
- fused=self.config.fused,
- dtype=self.config.dtype),
- tf.keras.layers.Activation("relu"),
- ],
- name="init")
- if self.config.init_max_pool:
- init_block.add(
- tf.keras.layers.MaxPooling2D(
- pool_size=(3, 3),
- strides=(2, 2),
- padding="SAME",
- data_format=self.config.data_format,
- dtype=self.config.dtype))
- return init_block
-
- def _construct_final_block(self):
- f = self.config.filters[-1] # Number of filters
- r = functools.reduce(operator.mul, self.config.strides, 1) # Reduce ratio
- r *= self.config.init_stride
- if self.config.init_max_pool:
- r *= 2
-
- if self.config.data_format == "channels_first":
- w, h = self.config.input_shape[1], self.config.input_shape[2]
- input_shape = (f, w // r, h // r)
- elif self.config.data_format == "channels_last":
- w, h = self.config.input_shape[0], self.config.input_shape[1]
- input_shape = (w // r, h // r, f)
- else:
- raise ValueError("Data format should be either `channels_first`"
- " or `channels_last`")
-
- final_block = tf.keras.Sequential(
- [
- tf.keras.layers.BatchNormalization(
- axis=self.axis,
- input_shape=input_shape,
- fused=self.config.fused,
- dtype=self.config.dtype),
- tf.keras.layers.Activation("relu"),
- tf.keras.layers.GlobalAveragePooling2D(
- data_format=self.config.data_format, dtype=self.config.dtype),
- tf.keras.layers.Dense(
- self.config.n_classes, dtype=self.config.dtype)
- ],
- name="final")
- return final_block
+ self._moving_average_variables = []
def _construct_intermediate_blocks(self):
# Precompute input shape after initial block
@@ -193,109 +128,90 @@ class RevNet(tf.keras.Model):
return tf.reduce_mean(cross_ent)
- def compute_gradients(self, inputs, labels, training=True, l2_reg=True):
+ def compute_gradients(self, saved_hidden, labels, training=True, l2_reg=True):
"""Manually computes gradients.
- When eager execution is enabled, this method also SILENTLY updates the
- running averages of batch normalization when `training` is set to True.
+ This method silently updates the running averages of batch normalization.
Args:
- inputs: Image tensor, either NHWC or NCHW, conforming to `data_format`
+      saved_hidden: List of hidden state Tensors
labels: One-hot labels for classification
training: Use the mini-batch stats in batch norm if set to True
l2_reg: Apply l2 regularization
Returns:
- list of tuples each being (grad, var) for optimizer to use
+ A tuple with the first entry being a list of all gradients and the second
+ being the loss
"""
- # Run forward pass to record hidden states; avoid updating running averages
- vars_and_vals = self.get_moving_stats()
- _, saved_hidden = self.call(inputs, training=training)
- self.restore_moving_stats(vars_and_vals)
-
- grads_all = []
- vars_all = []
+ def _defunable_pop(l):
+ """Functional style list pop that works with `tfe.defun`."""
+ t, l = l[-1], l[:-1]
+ return t, l
- # Manually backprop through last block
+ # Backprop through last block
x = saved_hidden[-1]
with tf.GradientTape() as tape:
- x = tf.identity(x)
tape.watch(x)
- # Running stats updated below
logits = self._final_block(x, training=training)
loss = self.compute_loss(logits, labels)
-
grads_combined = tape.gradient(loss,
[x] + self._final_block.trainable_variables)
- dy, grads_ = grads_combined[0], grads_combined[1:]
- grads_all += grads_
- vars_all += self._final_block.trainable_variables
+ dy, final_grads = grads_combined[0], grads_combined[1:]
- # Manually backprop through intermediate blocks
+ # Backprop through intermediate blocks
+ intermediate_grads = []
for block in reversed(self._block_list):
- y = saved_hidden.pop()
+ y, saved_hidden = _defunable_pop(saved_hidden)
x = saved_hidden[-1]
- dy, grads, vars_ = block.backward_grads_and_vars(
- x, y, dy, training=training)
- grads_all += grads
- vars_all += vars_
-
- # Manually backprop through first block
- saved_hidden.pop()
- x = saved_hidden.pop()
- assert not saved_hidden # Cleared after backprop
+ dy, grads = block.backward_grads(x, y, dy, training=training)
+ intermediate_grads = grads + intermediate_grads
+ # Backprop through first block
+ _, saved_hidden = _defunable_pop(saved_hidden)
+ x, saved_hidden = _defunable_pop(saved_hidden)
+ assert not saved_hidden
with tf.GradientTape() as tape:
- x = tf.identity(x)
- # Running stats updated below
y = self._init_block(x, training=training)
-
- grads_all += tape.gradient(
+ init_grads = tape.gradient(
y, self._init_block.trainable_variables, output_gradients=dy)
- vars_all += self._init_block.trainable_variables
- # Apply weight decay
+    # Ordering matches up with `model.trainable_variables`
+ grads_all = init_grads + final_grads + intermediate_grads
if l2_reg:
- grads_all = self._apply_weight_decay(grads_all, vars_all)
+ grads_all = self._apply_weight_decay(grads_all)
- return grads_all, vars_all, loss
+ return grads_all, loss
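
The functional-style pop above is what keeps `compute_gradients` compatible with `tfe.defun`: tracing cannot observe an in-place `list.pop`, so the helper returns a shortened copy instead. A plain-Python illustration of its semantics:

```python
def _defunable_pop(l):
  """Functional-style pop: returns (last element, remaining list)."""
  t, l = l[-1], l[:-1]
  return t, l

top, rest = _defunable_pop([10, 20, 30])
assert top == 30 and rest == [10, 20]
```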
- def _apply_weight_decay(self, grads, vars_):
+ def _apply_weight_decay(self, grads):
"""Update gradients to reflect weight decay."""
- # Don't decay bias
return [
g + self.config.weight_decay * v if v.name.endswith("kernel:0") else g
- for g, v in zip(grads, vars_)
+ for g, v in zip(grads, self.trainable_variables)
]
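
The decay above is the usual L2 shortcut: adding `weight_decay * v` to a kernel's gradient is equivalent to differentiating a penalty `0.5 * weight_decay * ||v||^2` added to the loss. A small numeric check with illustrative values:

```python
import numpy as np

weight_decay = 2e-4               # illustrative value
v = np.array([1.0, -2.0])         # a "kernel" variable
g = np.array([0.1, 0.3])          # its data-loss gradient
g_decayed = g + weight_decay * v  # same as grad of loss + 0.5*wd*||v||^2
```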
def get_moving_stats(self):
- """Get moving averages of batch normalization.
-
- This is needed to avoid updating the running average twice in one iteration.
-
- Returns:
- A dictionary mapping variables for batch normalization moving averages
- to their current values.
- """
- vars_and_vals = {}
-
- def _is_moving_var(v):
+ """Get moving averages of batch normalization."""
+ device = "/gpu:0" if tf.test.is_gpu_available() else "/cpu:0"
+ with tf.device(device):
+ return [v.read_value() for v in self.moving_average_variables]
+
+ def restore_moving_stats(self, values):
+ """Restore moving averages of batch normalization."""
+ device = "/gpu:0" if tf.test.is_gpu_available() else "/cpu:0"
+ with tf.device(device):
+ for var_, val in zip(self.moving_average_variables, values):
+ var_.assign(val)
+
+ @property
+ def moving_average_variables(self):
+ """Get all variables that are batch norm moving averages."""
+
+ def _is_moving_avg(v):
n = v.name
return n.endswith("moving_mean:0") or n.endswith("moving_variance:0")
- for v in filter(_is_moving_var, self.variables):
- vars_and_vals[v] = v.read_value()
+ if not self._moving_average_variables:
+      # Materialize the filter; a lazy iterator would be exhausted after one use.
+      self._moving_average_variables = list(
+          filter(_is_moving_avg, self.variables))
- return vars_and_vals
-
- def restore_moving_stats(self, vars_and_vals):
- """Restore moving averages of batch normalization.
-
- This is needed to avoid updating the running average twice in one iteration.
-
- Args:
- vars_and_vals: The dictionary mapping variables to their previous values.
- """
- for var_, val in six.iteritems(vars_and_vals):
- var_.assign(val)
+ return self._moving_average_variables
diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
index b0d0a5486d..84b2ddf0de 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
@@ -31,10 +31,13 @@ tfe = tf.contrib.eager
def train_one_iter(model, inputs, labels, optimizer, global_step=None):
"""Train for one iteration."""
- grads, vars_, loss = model.compute_gradients(inputs, labels, training=True)
- optimizer.apply_gradients(zip(grads, vars_), global_step=global_step)
+ logits, saved_hidden = model(inputs)
+ grads, loss = model.compute_gradients(
+ saved_hidden=saved_hidden, labels=labels)
+ optimizer.apply_gradients(
+ zip(grads, model.trainable_variables), global_step=global_step)
- return loss
+ return logits, loss
class RevNetTest(tf.test.TestCase):
@@ -42,6 +45,8 @@ class RevNetTest(tf.test.TestCase):
def setUp(self):
super(RevNetTest, self).setUp()
config = config_.get_hparams_cifar_38()
+ config.add_hparam("n_classes", 10)
+ config.add_hparam("dataset", "cifar-10")
# Reconstruction could cause numerical error, use double precision for tests
config.dtype = tf.float64
config.fused = False # Fused batch norm does not support tf.float64
@@ -93,9 +98,10 @@ class RevNetTest(tf.test.TestCase):
def test_compute_gradients(self):
"""Test `compute_gradients` function."""
- self.model(self.x, training=False) # Initialize model
- grads, vars_, loss = self.model.compute_gradients(
- inputs=self.x, labels=self.t, training=True, l2_reg=True)
+ _, saved_hidden = self.model(self.x) # Initialize model
+ grads, loss = self.model.compute_gradients(
+ saved_hidden=saved_hidden, labels=self.t)
+ vars_ = self.model.trainable_variables
self.assertTrue(isinstance(grads, list))
self.assertTrue(isinstance(vars_, list))
self.assertEqual(len(grads), len(vars_))
@@ -104,7 +110,7 @@ class RevNetTest(tf.test.TestCase):
# Compare against the true gradient computed by the tape
with tf.GradientTape() as tape:
- logits, _ = self.model(self.x, training=True)
+ logits, _ = self.model(self.x)
loss_true = self.model.compute_loss(logits=logits, labels=self.t)
grads_true = tape.gradient(loss_true, vars_)
self.assertAllClose(loss, loss_true)
@@ -119,7 +125,9 @@ class RevNetTest(tf.test.TestCase):
def test_compute_gradients_defun(self):
"""Test `compute_gradients` function with defun."""
compute_gradients = tfe.defun(self.model.compute_gradients)
- grads, vars_, _ = compute_gradients(self.x, self.t, training=True)
+ _, saved_hidden = self.model(self.x)
+ grads, _ = compute_gradients(saved_hidden=saved_hidden, labels=self.t)
+ vars_ = self.model.trainable_variables
self.assertTrue(isinstance(grads, list))
self.assertTrue(isinstance(vars_, list))
self.assertEqual(len(grads), len(vars_))
@@ -131,6 +139,9 @@ class RevNetTest(tf.test.TestCase):
"""Test model training in graph mode."""
with tf.Graph().as_default():
config = config_.get_hparams_cifar_38()
+ config.add_hparam("n_classes", 10)
+ config.add_hparam("dataset", "cifar-10")
+
x = tf.random_normal(
shape=(self.config.batch_size,) + self.config.input_shape)
t = tf.random_uniform(
@@ -140,15 +151,11 @@ class RevNetTest(tf.test.TestCase):
dtype=tf.int32)
global_step = tf.Variable(0., trainable=False)
model = revnet.RevNet(config=config)
- model(x)
- updates = model.get_updates_for(x)
-
- x_ = tf.identity(x)
- grads_all, vars_all, _ = model.compute_gradients(x_, t, training=True)
+ _, saved_hidden = model(x)
+ grads, _ = model.compute_gradients(saved_hidden=saved_hidden, labels=t)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
- with tf.control_dependencies(updates):
- train_op = optimizer.apply_gradients(
- zip(grads_all, vars_all), global_step=global_step)
+ train_op = optimizer.apply_gradients(
+ zip(grads, model.trainable_variables), global_step=global_step)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index efa6ba0626..6efafccd6b 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -291,8 +291,6 @@ class Metric(checkpointable.CheckpointableBase):
class Mean(Metric):
"""Computes the (weighted) mean of the given values."""
- # TODO(josh11b): Maybe have a dtype argument that defaults to tf.float64?
- # Or defaults to type of the input if it is tf.float32, else tf.float64?
def __init__(self, name=None, dtype=dtypes.float64,
use_global_variables=False):
@@ -377,7 +375,7 @@ class Accuracy(Mean):
array_ops.shape(labels), array_ops.shape(predictions),
message="Shapes of labels and predictions are unequal")
matches = math_ops.equal(labels, predictions)
- matches = math_ops.cast(matches, dtypes.float64)
+ matches = math_ops.cast(matches, self.dtype)
super(Accuracy, self).call(matches, weights=weights)
if weights is None:
return labels, predictions
@@ -421,7 +419,7 @@ class CategoricalAccuracy(Mean):
labels = math_ops.argmax(labels, axis=-1)
predictions = math_ops.argmax(predictions, axis=-1)
matches = math_ops.equal(labels, predictions)
- matches = math_ops.cast(matches, dtypes.float64)
+ matches = math_ops.cast(matches, self.dtype)
super(CategoricalAccuracy, self).call(matches, weights=weights)
if weights is None:
return labels, predictions
@@ -472,7 +470,7 @@ class BinaryAccuracy(Mean):
predictions = ops.convert_to_tensor(predictions)
predictions = predictions > self.threshold
matches = math_ops.equal(labels, predictions)
- matches = math_ops.cast(matches, dtypes.float64)
+ matches = math_ops.cast(matches, self.dtype)
super(BinaryAccuracy, self).call(matches, weights=weights)
if weights is None:
return labels, predictions
@@ -520,7 +518,7 @@ class SparseAccuracy(Mean):
predictions = math_ops.argmax(predictions, axis=-1)
labels = math_ops.cast(labels, dtypes.int64)
matches = math_ops.equal(labels, predictions)
- matches = math_ops.cast(matches, dtypes.float64)
+ matches = math_ops.cast(matches, self.dtype)
super(SparseAccuracy, self).call(matches, weights=weights)
if weights is None:
return labels, predictions
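
With the casts above now honoring `self.dtype`, a metric constructed with a non-default dtype accumulates in that dtype end to end. A hedged usage sketch under eager execution:

```python
import tensorflow as tf
tfe = tf.contrib.eager

tf.enable_eager_execution()
accuracy = tfe.metrics.Accuracy(dtype=tf.float32)
accuracy(labels=tf.constant([1, 2]), predictions=tf.constant([1, 0]))
print(accuracy.result())  # 0.5, accumulated in float32 rather than float64
```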
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index fdaca90fd1..d709308647 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -125,8 +125,8 @@ class Saver(object):
Args:
var_list: The list of variables that will be saved and restored. Either a
- list of `tfe.Variable` objects, or a dictionary mapping names to
- `tfe.Variable` objects.
+ list of `tf.Variable` objects, or a dictionary mapping names to
+ `tf.Variable` objects.
Raises:
RuntimeError: if invoked when eager execution has not been enabled.
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index ca6430253b..2f0ab616e4 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -34,6 +34,7 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
@@run
@@enable_eager_execution
+@@enable_remote_eager_execution
@@custom_gradient
@@ -114,6 +115,7 @@ from tensorflow.python.eager.execution_callbacks import inf_nan_callback
from tensorflow.python.eager.execution_callbacks import nan_callback
from tensorflow.python.eager.execution_callbacks import seterr
from tensorflow.python.framework.ops import enable_eager_execution
+from tensorflow.python.framework.ops import enable_eager_execution_internal as enable_remote_eager_execution
from tensorflow.python.framework.ops import eager_run as run
from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes
from tensorflow.python.framework.test_util import run_all_in_graph_and_eager_modes as run_all_tests_in_graph_and_eager_modes
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 1aa3df8d8d..349f48f7f7 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -28,6 +28,7 @@ py_library(
":multi_head",
":replicate_model_fn",
":rnn",
+ ":saved_model_estimator",
"//tensorflow:tensorflow_py_no_contrib",
],
)
@@ -465,3 +466,43 @@ py_test(
"@absl_py//absl/testing:parameterized",
],
)
+
+py_library(
+ name = "saved_model_estimator",
+ srcs = ["python/estimator/saved_model_estimator.py"],
+ deps = [
+ ":export",
+ "//tensorflow/python:framework_ops",
+ "//tensorflow/python:platform",
+ "//tensorflow/python:training",
+ "//tensorflow/python/estimator",
+ "//tensorflow/python/estimator:export",
+ "//tensorflow/python/estimator:model_fn",
+ "//tensorflow/python/saved_model",
+ ],
+)
+
+py_test(
+ name = "saved_model_estimator_test",
+ size = "medium",
+ srcs = ["python/estimator/saved_model_estimator_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":export",
+ ":saved_model_estimator",
+ "//tensorflow/python:array_ops",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:control_flow_ops",
+ "//tensorflow/python:framework_ops",
+ "//tensorflow/python:metrics",
+ "//tensorflow/python:platform",
+ "//tensorflow/python:state_ops",
+ "//tensorflow/python:training",
+ "//tensorflow/python:variables",
+ "//tensorflow/python/data/ops:dataset_ops",
+ "//tensorflow/python/estimator",
+ "//tensorflow/python/estimator:export_export",
+ "//tensorflow/python/estimator:export_output",
+ "//tensorflow/python/estimator:model_fn",
+ ],
+)
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index 09fcfd66a1..e1453ae1d0 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -33,6 +33,8 @@ from tensorflow.contrib.estimator.python.estimator.logit_fns import *
from tensorflow.contrib.estimator.python.estimator.multi_head import *
from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
from tensorflow.contrib.estimator.python.estimator.rnn import *
+from tensorflow.contrib.estimator.python.estimator.saved_model_estimator import *
+from tensorflow.python.estimator.export.export import *
from tensorflow.python.util.all_util import remove_undocumented
# pylint: enable=unused-import,line-too-long,wildcard-import
@@ -70,6 +72,9 @@ _allowed_symbols = [
'stop_if_higher_hook',
'stop_if_no_increase_hook',
'stop_if_no_decrease_hook',
+ 'build_raw_supervised_input_receiver_fn',
+ 'build_supervised_input_receiver_fn_from_input_fn',
+ 'SavedModelEstimator'
]
remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index 43bfcffd79..7ed77bcce6 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -50,7 +50,8 @@ class _BoostedTreesEstimator(estimator.Estimator):
tree_complexity=0.,
min_node_weight=0.,
config=None,
- center_bias=False):
+ center_bias=False,
+ pruning_mode='none'):
"""Initializes a `BoostedTreesEstimator` instance.
Args:
@@ -89,13 +90,18 @@ class _BoostedTreesEstimator(estimator.Estimator):
regression problems, the first node will return the mean of the labels.
For binary classification problems, it will return a logit for a prior
probability of label 1.
+      pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
+        pruning (do not split a node if not enough gain is observed) and post-
+        pruning (build the tree up to a max depth and then prune branches with
+        negative gain). For pre- and post-pruning, you MUST provide
+        tree_complexity > 0.
"""
# pylint:disable=protected-access
# HParams for the model.
tree_hparams = canned_boosted_trees._TreeHParams(
n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
- tree_complexity, min_node_weight, center_bias)
+ tree_complexity, min_node_weight, center_bias, pruning_mode)
def _model_fn(features, labels, mode, config):
return canned_boosted_trees._bt_model_fn(
@@ -129,7 +135,8 @@ def boosted_trees_classifier_train_in_memory(
min_node_weight=0.,
config=None,
train_hooks=None,
- center_bias=False):
+ center_bias=False,
+ pruning_mode='none'):
"""Trains a boosted tree classifier with in memory dataset.
Example:
@@ -208,6 +215,11 @@ def boosted_trees_classifier_train_in_memory(
regression problems, the first node will return the mean of the labels.
For binary classification problems, it will return a logit for a prior
probability of label 1.
+    pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
+      pruning (do not split a node if not enough gain is observed) and post-
+      pruning (build the tree up to a max depth and then prune branches with
+      negative gain). For pre- and post-pruning, you MUST provide
+      tree_complexity > 0.
Returns:
a `BoostedTreesClassifier` instance created with the given arguments and
@@ -228,7 +240,7 @@ def boosted_trees_classifier_train_in_memory(
# HParams for the model.
tree_hparams = canned_boosted_trees._TreeHParams(
n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
- tree_complexity, min_node_weight, center_bias)
+ tree_complexity, min_node_weight, center_bias, pruning_mode)
def _model_fn(features, labels, mode, config):
return canned_boosted_trees._bt_model_fn(
@@ -269,7 +281,8 @@ def boosted_trees_regressor_train_in_memory(
min_node_weight=0.,
config=None,
train_hooks=None,
- center_bias=False):
+ center_bias=False,
+ pruning_mode='none'):
"""Trains a boosted tree regressor with in memory dataset.
Example:
@@ -341,6 +354,11 @@ def boosted_trees_regressor_train_in_memory(
regression problems, the first node will return the mean of the labels.
For binary classification problems, it will return a logit for a prior
probability of label 1.
+    pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
+      pruning (do not split a node if not enough gain is observed) and post-
+      pruning (build the tree up to a max depth and then prune branches with
+      negative gain). For pre- and post-pruning, you MUST provide
+      tree_complexity > 0.
Returns:
a `BoostedTreesClassifier` instance created with the given arguments and
@@ -360,7 +378,7 @@ def boosted_trees_regressor_train_in_memory(
# HParams for the model.
tree_hparams = canned_boosted_trees._TreeHParams(
n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
- tree_complexity, min_node_weight, center_bias)
+ tree_complexity, min_node_weight, center_bias, pruning_mode)
def _model_fn(features, labels, mode, config):
return canned_boosted_trees._bt_model_fn(
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
index 999c2aa5e2..b1581f3750 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -136,6 +136,49 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
eval_res = est.evaluate(input_fn=input_fn, steps=1)
self.assertAllClose(eval_res['average_loss'], 0.614642)
+ def testTrainAndEvaluateEstimatorWithPrePruning(self):
+ input_fn = _make_train_input_fn(is_classification=False)
+
+ est = boosted_trees._BoostedTreesEstimator(
+ feature_columns=self._feature_columns,
+ n_batches_per_layer=1,
+ n_trees=2,
+ head=self._head,
+ max_depth=5,
+ tree_complexity=0.001,
+ pruning_mode='pre')
+
+ num_steps = 100
+ # Train for a few steps, and validate final checkpoint.
+ est.train(input_fn, steps=num_steps)
+    # We actually stop after 2*depth*n_trees steps (via a hook) because we
+    # still could not grow 2 trees of depth 5 (due to pre-pruning).
+ self._assert_checkpoint(
+ est.model_dir, global_step=21, finalized_trees=0, attempted_layers=21)
+ eval_res = est.evaluate(input_fn=input_fn, steps=1)
+ self.assertAllClose(eval_res['average_loss'], 3.83943)
+
+ def testTrainAndEvaluateEstimatorWithPostPruning(self):
+ input_fn = _make_train_input_fn(is_classification=False)
+
+ est = boosted_trees._BoostedTreesEstimator(
+ feature_columns=self._feature_columns,
+ n_batches_per_layer=1,
+ n_trees=2,
+ head=self._head,
+ max_depth=5,
+ tree_complexity=0.001,
+ pruning_mode='post')
+
+ # It will stop after 10 steps because of the max depth and num trees.
+ num_steps = 100
+ # Train for a few steps, and validate final checkpoint.
+ est.train(input_fn, steps=num_steps)
+ self._assert_checkpoint(
+ est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
+ eval_res = est.evaluate(input_fn=input_fn, steps=1)
+ self.assertAllClose(eval_res['average_loss'], 2.37652)
+
def testInferEstimator(self):
train_input_fn = _make_train_input_fn(is_classification=False)
predict_input_fn = numpy_io.numpy_input_fn(
@@ -231,6 +274,31 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
self.assertAllClose([[0], [1], [1], [0], [0]],
[pred['class_ids'] for pred in predictions])
+ def testBinaryClassifierTrainInMemoryAndEvalAndInferWithPrePruning(self):
+ train_input_fn = _make_train_input_fn(is_classification=True)
+ predict_input_fn = numpy_io.numpy_input_fn(
+ x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+ est = boosted_trees.boosted_trees_classifier_train_in_memory(
+ train_input_fn=train_input_fn,
+ feature_columns=self._feature_columns,
+ n_trees=1,
+ max_depth=5,
+ pruning_mode='pre',
+ tree_complexity=0.01)
+    # We actually stop after 2*depth*n_trees steps (via a hook) because we
+    # still could not grow 1 tree of depth 5 (due to pre-pruning).
+ self._assert_checkpoint(
+ est.model_dir, global_step=11, finalized_trees=0, attempted_layers=11)
+
+ # Check evaluate and predict.
+ eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+ self.assertAllClose(eval_res['accuracy'], 1.0)
+ # Validate predictions.
+ predictions = list(est.predict(input_fn=predict_input_fn))
+ self.assertAllClose([[0], [1], [1], [0], [0]],
+ [pred['class_ids'] for pred in predictions])
+
def testBinaryClassifierTrainInMemoryWithDataset(self):
train_input_fn = _make_train_input_fn_dataset(is_classification=True)
predict_input_fn = numpy_io.numpy_input_fn(
diff --git a/tensorflow/contrib/estimator/python/estimator/early_stopping.py b/tensorflow/contrib/estimator/python/estimator/early_stopping.py
index af4855e91e..3eab21d5ac 100644
--- a/tensorflow/contrib/estimator/python/estimator/early_stopping.py
+++ b/tensorflow/contrib/estimator/python/estimator/early_stopping.py
@@ -394,10 +394,11 @@ def _summaries(eval_dir):
Yields:
`tensorflow.Event` object read from the event files.
"""
- for event_file in gfile.Glob(
- os.path.join(eval_dir, _EVENT_FILE_GLOB_PATTERN)):
- for event in summary_iterator.summary_iterator(event_file):
- yield event
+ if gfile.Exists(eval_dir):
+ for event_file in gfile.Glob(
+ os.path.join(eval_dir, _EVENT_FILE_GLOB_PATTERN)):
+ for event in summary_iterator.summary_iterator(event_file):
+ yield event
def _get_or_create_stop_var():
diff --git a/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py b/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py
index b5eee818fa..e4bfd4b446 100644
--- a/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py
@@ -92,6 +92,19 @@ class ReadEvalMetricsTest(test.TestCase):
},
}, early_stopping.read_eval_metrics(eval_dir))
+ def test_read_eval_metrics_when_no_events(self):
+ eval_dir = tempfile.mkdtemp()
+ self.assertTrue(os.path.exists(eval_dir))
+
+ # No error should be raised when eval directory exists with no event files.
+ self.assertEqual({}, early_stopping.read_eval_metrics(eval_dir))
+
+ os.rmdir(eval_dir)
+ self.assertFalse(os.path.exists(eval_dir))
+
+ # No error should be raised when eval directory does not exist.
+ self.assertEqual({}, early_stopping.read_eval_metrics(eval_dir))
+
class EarlyStoppingHooksTest(test.TestCase, parameterized.TestCase):
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks.py b/tensorflow/contrib/estimator/python/estimator/hooks.py
index ddd6aa442f..caadafdfa6 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks.py
@@ -189,7 +189,7 @@ class InMemoryEvaluatorHook(training.SessionRunHook):
init_fn=feed_variables, copy_from_scaffold=self._scaffold)
with self._graph.as_default():
- return self._estimator._evaluate_run(
+ self._estimator._evaluate_run(
checkpoint_path=None,
scaffold=scaffold,
update_op=self._update_op,
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks_test.py b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
index 95ae971852..ee88d5ecf5 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
@@ -102,6 +102,7 @@ class InMemoryEvaluatorHookTest(test.TestCase):
self.assertTrue(os.path.isdir(estimator.eval_dir()))
step_keyword_to_value = summary_step_keyword_to_value_mapping(
estimator.eval_dir())
+
# 4.5 = sum(range(10))/10
# before training
self.assertEqual(4.5, step_keyword_to_value[0]['mean_of_features'])
@@ -110,6 +111,7 @@ class InMemoryEvaluatorHookTest(test.TestCase):
self.assertEqual(4.5, step_keyword_to_value[8]['mean_of_features'])
# end
self.assertEqual(4.5, step_keyword_to_value[10]['mean_of_features'])
+ self.assertEqual(set([0, 4, 8, 10]), set(step_keyword_to_value.keys()))
def test_uses_latest_variable_value(self):
diff --git a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py
new file mode 100644
index 0000000000..b0082f7e55
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py
@@ -0,0 +1,449 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class that creates an Estimator from a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export as export_lib
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training import training_util
+
+
+class SavedModelEstimator(estimator_lib.Estimator):
+ """Create an Estimator from a SavedModel.
+
+ Only SavedModels exported with
+ `tf.contrib.estimator.export_all_saved_models()` or
+ `tf.estimator.Estimator.export_savedmodel()` are supported for this class.
+
+ Example with `tf.estimator.DNNClassifier`:
+
+ **Step 1: Create and train DNNClassifier.**
+
+ ```python
+ feature1 = tf.feature_column.embedding_column(
+ tf.feature_column.categorical_column_with_vocabulary_list(
+ key='feature1', vocabulary_list=('green', 'yellow')), dimension=1)
+ feature2 = tf.feature_column.numeric_column(key='feature2', default_value=0.0)
+
+ classifier = tf.estimator.DNNClassifier(
+ hidden_units=[4,2], feature_columns=[feature1, feature2])
+
+ def input_fn():
+ features = {'feature1': tf.constant(['green', 'green', 'yellow']),
+ 'feature2': tf.constant([3.5, 4.2, 6.1])}
+ label = tf.constant([1., 0., 0.])
+ return tf.data.Dataset.from_tensors((features, label)).repeat()
+
+ classifier.train(input_fn=input_fn, steps=10)
+ ```
+
+ **Step 2: Export classifier.**
+ First, build functions that specify the expected inputs.
+
+ ```python
+ # During train and evaluation, both the features and labels should be defined.
+ supervised_input_receiver_fn = (
+ tf.contrib.estimator.build_raw_supervised_input_receiver_fn(
+ {'feature1': tf.placeholder(dtype=tf.string, shape=[None]),
+ 'feature2': tf.placeholder(dtype=tf.float32, shape=[None])},
+ tf.placeholder(dtype=tf.float32, shape=[None])))
+
+ # During predict mode, expect to receive a `tf.Example` proto, so a parsing
+ # function is used.
+ serving_input_receiver_fn = (
+ tf.estimator.export.build_parsing_serving_input_receiver_fn(
+ tf.feature_column.make_parse_example_spec([feature1, feature2])))
+ ```
+
+ Next, export the model as a SavedModel. A timestamped directory will be
+ created (for example `/tmp/export_all/1234567890`).
+
+ ```python
+ # Option 1: Save all modes (train, eval, predict)
+ export_dir = tf.contrib.estimator.export_all_saved_models(
+ classifier, '/tmp/export_all',
+ {tf.estimator.ModeKeys.TRAIN: supervised_input_receiver_fn,
+ tf.estimator.ModeKeys.EVAL: supervised_input_receiver_fn,
+ tf.estimator.ModeKeys.PREDICT: serving_input_receiver_fn})
+
+ # Option 2: Only export predict mode
+ export_dir = classifier.export_savedmodel(
+ '/tmp/export_predict', serving_input_receiver_fn)
+ ```
+
+ **Step 3: Create a SavedModelEstimator from the exported SavedModel.**
+
+ ```python
+ est = tf.contrib.estimator.SavedModelEstimator(export_dir)
+
+ # If all modes were exported, you can immediately evaluate and predict, or
+ # continue training. Otherwise only predict is available.
+ eval_results = est.evaluate(input_fn=input_fn, steps=1)
+ print(eval_results)
+
+ est.train(input_fn=input_fn, steps=20)
+
+ def predict_input_fn():
+ example = tf.train.Example()
+ example.features.feature['feature1'].bytes_list.value.extend(['yellow'])
+ example.features.feature['feature2'].float_list.value.extend([1.])
+    return {'inputs': tf.constant([example.SerializeToString()])}
+
+ predictions = est.predict(predict_input_fn)
+ print(next(predictions))
+ ```
+ """
+
+ def __init__(self, saved_model_dir, model_dir=None):
+ """Initialize a SavedModelEstimator.
+
+ The SavedModelEstimator loads its model function and variable values from
+ the graphs defined in the SavedModel. There is no option to pass in
+ `RunConfig` or `params` arguments, because the model function graph is
+ defined statically in the SavedModel.
+
+ Args:
+ saved_model_dir: Directory containing SavedModel protobuf and subfolders.
+ model_dir: Directory to save new checkpoints during training.
+
+ Raises:
+ NotImplementedError: If a DistributionStrategy is defined in the config.
+ Unless the SavedModelEstimator is subclassed, this shouldn't happen.
+ """
+ checkpoint = estimator_lib._get_saved_model_ckpt(saved_model_dir) # pylint: disable=protected-access
+ vars_to_warm_start = [name for name, _ in
+ checkpoint_utils.list_variables(checkpoint)]
+ warm_start_settings = estimator_lib.WarmStartSettings(
+ ckpt_to_initialize_from=checkpoint,
+ vars_to_warm_start=vars_to_warm_start)
+
+ super(SavedModelEstimator, self).__init__(
+ model_fn=self._model_fn_from_saved_model, model_dir=model_dir,
+ warm_start_from=warm_start_settings)
+ if self._distribution is not None:
+ raise NotImplementedError(
+ 'SavedModelEstimator currently does not support '
+ 'DistributionStrategy.')
+ self.saved_model_dir = saved_model_dir
+ self.saved_model_loader = loader_impl.SavedModelLoader(saved_model_dir)
+ self._available_modes = self._extract_available_modes()
+
+ def _extract_available_modes(self):
+ """Return list of modes found in SavedModel."""
+ available_modes = []
+ logging.info('Checking available modes for SavedModelEstimator.')
+ for mode in [model_fn_lib.ModeKeys.TRAIN, model_fn_lib.ModeKeys.EVAL,
+ model_fn_lib.ModeKeys.PREDICT]:
+ try:
+ self._get_meta_graph_def_for_mode(mode)
+ except RuntimeError:
+ logging.warning('%s mode not found in SavedModel.' % mode)
+ continue
+
+ if self._get_signature_def_for_mode(mode) is not None:
+ available_modes.append(mode)
+
+ logging.info('Available modes for Estimator: %s' % available_modes)
+ return available_modes
+
+ def _validate_mode(self, mode):
+ """Make sure that mode can be run using the SavedModel."""
+ if mode not in self._available_modes:
+ raise RuntimeError('%s mode is not available in the SavedModel. Use '
+ 'saved_model_cli to check that the Metagraph for this '
+ 'mode has been exported.' % mode)
+
+ def _get_meta_graph_def_for_mode(self, mode):
+ tags = model_fn_lib.EXPORT_TAG_MAP[mode]
+ return self.saved_model_loader.get_meta_graph_def_from_tags(tags)
+
+ def _get_signature_def_for_mode(self, mode):
+ meta_graph_def = self._get_meta_graph_def_for_mode(mode)
+ sig_def_key = (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+ if mode == model_fn_lib.ModeKeys.PREDICT else mode)
+ if sig_def_key not in meta_graph_def.signature_def:
+ logging.warning('Metagraph for mode %s was found, but SignatureDef with'
+ ' key \"%s\" is missing.' % (mode, sig_def_key))
+ return None
+ return meta_graph_def.signature_def[sig_def_key]
+
+ def _create_and_assert_global_step(self, graph):
+    # Do nothing here. The global step variable will be created/loaded from
+    # the SavedModel. If a global step variable were created here, the result
+    # would be two duplicate global step variables, causing issues during
+    # the warm-start phase.
+    # Because the global step variable is created in the model function, this
+    # may cause issues when running DistributionStrategy. Thus,
+    # DistributionStrategy is not yet supported with SavedModelEstimator.
+ return None
+
+ def _model_fn_from_saved_model(self, features, labels, mode):
+ """Load a SavedModel graph and return an EstimatorSpec."""
+ # TODO(kathywu): Model function loads placeholders from the graph. Calling
+ # export_all_saved_models creates another placeholder for the inputs, on top
+ # of the original placeholders. There should be a way to avoid this.
+ self._validate_mode(mode)
+
+ g = ops.get_default_graph()
+ if training_util.get_global_step(g) is not None:
+ raise RuntimeError(
+ 'Graph must not contain a global step tensor before the SavedModel is'
+ ' loaded. Please make sure that the input function does not create a '
+ 'global step.')
+
+ # Extract SignatureDef for information about the input and output tensors.
+ signature_def = self._get_signature_def_for_mode(mode)
+
+ # Generate input map for replacing the inputs in the SavedModel graph with
+ # the provided features and labels.
+ input_map = _generate_input_map(signature_def, features, labels)
+
+ # Create a list of the names of output tensors. When the graph is loaded,
+ # names of the output tensors may be remapped. This ensures that the correct
+ # tensors are returned in the EstimatorSpec.
+ output_tensor_names = [
+ value.name for value in six.itervalues(signature_def.outputs)]
+
+    # Load the graph. `output_tensors` contains output `Tensors` in the same
+    # order as the `output_tensor_names` list.
+ tags = model_fn_lib.EXPORT_TAG_MAP[mode]
+ _, output_tensors = self.saved_model_loader.load_graph(
+ g, tags, input_map=input_map, return_elements=output_tensor_names)
+
+ # Create a scaffold from the MetaGraphDef that contains ops to initialize
+ # the graph. This should mirror the steps from _add_meta_graph_for_mode(),
+ # which creates a MetaGraphDef from the EstimatorSpec's scaffold.
+ scaffold = monitored_session.Scaffold(
+ local_init_op=loader_impl._get_main_op_tensor( # pylint: disable=protected-access
+ self._get_meta_graph_def_for_mode(mode)))
+
+ # Ensure that a global step tensor has been created.
+ global_step_tensor = training_util.get_global_step(g)
+ training_util.assert_global_step(global_step_tensor)
+
+ # Extract values to return in the EstimatorSpec.
+ output_map = dict(zip(output_tensor_names, output_tensors))
+ outputs = {key: output_map[value.name]
+ for key, value in six.iteritems(signature_def.outputs)}
+
+ loss, predictions, metrics = _validate_and_extract_outputs(
+ mode, outputs, signature_def.method_name)
+
+ train_op = ops.get_collection(constants.TRAIN_OP_KEY)
+ if len(train_op) > 1:
+ raise RuntimeError('Multiple ops found in the train_op collection.')
+ train_op = None if not train_op else train_op[0]
+
+ _clear_saved_model_collections()
+ return model_fn_lib.EstimatorSpec(
+ scaffold=scaffold,
+ mode=mode,
+ loss=loss,
+ train_op=train_op,
+ predictions=predictions,
+ eval_metric_ops=metrics)
+
+
+def _clear_saved_model_collections():
+  """Clear collections expected to be empty when exporting a SavedModel.
+
+ The SavedModel builder uses these collections to track ops necessary to
+ restore the graph state. These collections are expected to be empty before
+ MetaGraphs are added to the builder.
+ """
+ del ops.get_collection_ref(constants.ASSETS_KEY)[:]
+ del ops.get_collection_ref(constants.LEGACY_INIT_OP_KEY)[:]
+ del ops.get_collection_ref(constants.MAIN_OP_KEY)[:]
+ del ops.get_collection_ref(constants.TRAIN_OP_KEY)[:]
+
+
+def _generate_input_map(signature_def, features, labels):
+ """Return dict mapping an input tensor name to a feature or label tensor.
+
+ Args:
+ signature_def: SignatureDef loaded from SavedModel
+ features: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
+ `SparseTensor`, specifying the features to be passed to the model.
+ labels: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
+ `SparseTensor`, specifying the labels to be passed to the model. May be
+ `None`.
+
+ Returns:
+ dict mapping string names of inputs to features or labels tensors
+
+ Raises:
+ ValueError: if SignatureDef inputs are not completely mapped by the input
+ features and labels.
+ """
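+  # Illustrative sketch (hypothetical tensor names): for a SignatureDef with
+  # inputs {'x': TensorInfo(name='feature_x:0')} and features {'x': t}, this
+  # returns {'feature_x': t, '^feature_x': t}, so that both data edges and
+  # control edges referencing the loaded placeholder are remapped to t.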
+ # pylint: disable=protected-access
+ if not isinstance(features, dict):
+ features = {export_lib._SINGLE_FEATURE_DEFAULT_NAME: features}
+ if labels is not None and not isinstance(labels, dict):
+ labels = {export_lib._SINGLE_LABEL_DEFAULT_NAME: labels}
+ # pylint: enable=protected-access
+
+ inputs = signature_def.inputs
+ input_map = {}
+ for key, tensor_info in six.iteritems(inputs):
+ input_name = tensor_info.name
+ if ':' in input_name:
+ input_name = input_name[:input_name.find(':')]
+
+ # When tensors are used as control inputs for operations, their names are
+ # prepended with a '^' character in the GraphDef. To handle possible control
+ # flow edge cases, control input names must be included in the input map.
+ control_dependency_name = '^' + input_name
+
+ if key in features:
+ _check_same_dtype_and_shape(features[key], tensor_info, key)
+ input_map[input_name] = input_map[control_dependency_name] = features[key]
+ elif labels is not None and key in labels:
+ _check_same_dtype_and_shape(labels[key], tensor_info, key)
+ input_map[input_name] = input_map[control_dependency_name] = labels[key]
+ else:
+ raise ValueError(
+ 'Key \"%s\" not found in features or labels passed in to the model '
+ 'function. All required keys: %s' % (key, inputs.keys()))
+
+ return input_map
+
+
+def _check_same_dtype_and_shape(tensor, tensor_info, name):
+ """Validate that tensor has the same properties as the TensorInfo proto.
+
+ Args:
+ tensor: a `Tensor` object.
+ tensor_info: a `TensorInfo` proto.
+ name: Name of the input (to identify Tensor if an error is raised).
+
+ Raises:
+    ValueError: If the tensor shape or dtype doesn't match the TensorInfo.
+ """
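+  # For example, feeding an int32 feature where the SavedModel placeholder was
+  # exported as int64 fails with "Expected dtype: <dtype: 'int64'>, Got:
+  # <dtype: 'int32'>" (see test_with_incorrect_input in the test file).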
+ dtype_error = (tensor.dtype != dtypes.DType(tensor_info.dtype))
+ shape_error = not tensor.shape.is_compatible_with(tensor_info.tensor_shape)
+
+ if dtype_error or shape_error:
+ msg = 'Tensor shape and/or dtype validation failed for input %s:' % name
+ if dtype_error:
+ msg += ('\n\tExpected dtype: %s, Got: %s'
+ % (dtypes.DType(tensor_info.dtype), tensor.dtype))
+ if shape_error:
+ msg += ('\n\tExpected shape: %s, Got: %s'
+ % (tensor_shape.TensorShape(tensor_info.tensor_shape),
+ tensor.shape))
+
+ raise ValueError(msg)
+
+
+def _extract_eval_metrics(output_dict):
+  """Return an eval metric dict extracted from the output_dict.
+
+ Eval metrics consist of a value tensor and an update op. Both must be in the
+ passed-in tensor dictionary for an eval metric to be added to the returned
+ dictionary.
+
+ Args:
+ output_dict: a dict that maps strings to tensors.
+
+ Returns:
+ dict mapping strings to (value, update_op) tuples.
+ """
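+  # Sketch of the expected key layout, assuming the '/' separator used by
+  # _SupervisedOutput:
+  #   'metrics/abs_err/value'     -> value tensor
+  #   'metrics/abs_err/update_op' -> update op
+  # Both entries must be present for 'abs_err' to appear in the result.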
+ # pylint: disable=protected-access
+ metric_ops = {}
+ separator_char = export_output._SupervisedOutput._SEPARATOR_CHAR
+
+ for key, tensor in six.iteritems(output_dict):
+ split_key = key.split(separator_char)
+
+    # The metric name may itself contain the separator character, so rejoin
+    # all but the last component to recover it.
+ metric_name = separator_char.join(split_key[:-1])
+
+ if split_key[0] == export_output._SupervisedOutput.METRICS_NAME:
+      # If the key ends with the value suffix and there is a corresponding
+      # key ending with the update_op suffix, add both tensors to the metrics
+      # dict.
+ if split_key[-1] == export_output._SupervisedOutput.METRIC_VALUE_SUFFIX:
+ update_op = ''.join(
+ [metric_name, separator_char,
+ export_output._SupervisedOutput.METRIC_UPDATE_SUFFIX])
+ if update_op in output_dict:
+ update_op_tensor = output_dict[update_op]
+ metric_ops[metric_name] = (tensor, update_op_tensor)
+
+ # pylint: enable=protected-access
+ return metric_ops
+
+
+def _validate_and_extract_outputs(mode, output_dict, method_name):
+ """Extract values from SignatureDef output dictionary.
+
+ Args:
+ mode: One of the modes enumerated in `tf.estimator.ModeKeys`.
+ output_dict: dict of string SignatureDef keys to `Tensor`.
+ method_name: Method name of the SignatureDef as a string.
+
+ Returns:
+ Tuple of (
+ loss: `Tensor` object,
+ predictions: dictionary mapping string keys to `Tensor` objects,
+ metrics: dictionary mapping string keys to a tuple of two `Tensor` objects
+ )
+
+ Raises:
+ RuntimeError: raised if SignatureDef has an invalid method name for the mode
+ """
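+  # In PREDICT mode everything in `output_dict` is returned as predictions; in
+  # TRAIN/EVAL the outputs are split by key prefix (loss, metrics, predictions)
+  # following the _SupervisedOutput naming scheme used at export time.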
+ # pylint: disable=protected-access
+ loss, predictions, metrics = None, None, None
+
+ if mode == model_fn_lib.ModeKeys.PREDICT:
+ predictions = output_dict
+ else:
+ # Validate that the SignatureDef's method name matches the expected name for
+ # the given mode.
+ expected_method_name = signature_constants.SUPERVISED_TRAIN_METHOD_NAME
+ if mode == model_fn_lib.ModeKeys.EVAL:
+ expected_method_name = signature_constants.SUPERVISED_EVAL_METHOD_NAME
+ if method_name != expected_method_name:
+ raise RuntimeError(
+ 'Invalid SignatureDef method name for mode %s.\n\tExpected: %s\n\t'
+ 'Got: %s\nPlease ensure that the SavedModel was exported with '
+ '`tf.contrib.estimator.export_all_saved_models()`.' %
+ (mode, expected_method_name, method_name))
+
+ # Extract loss, metrics and predictions from the output dict.
+ loss = output_dict[export_output._SupervisedOutput.LOSS_NAME]
+ metrics = _extract_eval_metrics(output_dict)
+ predictions = {
+ key: value for key, value in six.iteritems(output_dict)
+ if key.split(export_output._SupervisedOutput._SEPARATOR_CHAR)[0] == (
+ export_output._SupervisedOutput.PREDICTIONS_NAME)}
+
+ # pylint: enable=protected-access
+ return loss, predictions, metrics
diff --git a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py
new file mode 100644
index 0000000000..718da1367c
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py
@@ -0,0 +1,369 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SavedModelEstimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+
+from tensorflow.contrib.estimator.python.estimator import export as contrib_export
+from tensorflow.contrib.estimator.python.estimator import saved_model_estimator
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training import training
+
+
+def dummy_input_fn():
+ return dataset_ops.Dataset.from_tensors((
+ {'x': constant_op.constant([[1], [-2]], dtype=dtypes.int64)},
+ constant_op.constant([[4], [-3]], dtype=dtypes.float32))).repeat()
+
+
+def dummy_input_fn_features_only():
+ return dataset_ops.Dataset.from_tensors(
+ {'x': constant_op.constant([[5], [6]], dtype=dtypes.int64)}).repeat()
+
+
+def dummy_supervised_receiver_fn():
+ feature_spec = {
+ 'x': array_ops.placeholder(
+ dtype=dtypes.int64, shape=(2, 1), name='feature_x'),
+ }
+ label_spec = array_ops.placeholder(
+ dtype=dtypes.float32, shape=[2, 1], name='truth')
+ return export.build_raw_supervised_input_receiver_fn(
+ feature_spec, label_spec)
+
+
+def dummy_serving_receiver_fn():
+ feature_spec = {'x': array_ops.placeholder(
+ dtype=dtypes.int64, shape=(2, 1), name='feature_x'),}
+ return export.build_raw_serving_input_receiver_fn(feature_spec)
+
+
+def model_fn_diff_modes(features, labels, mode):
+ _, _ = features, labels
+ v = variables.Variable(21, name='some_var')
+ train_op = None
+ loss = constant_op.constant(104)
+ if mode == model_fn_lib.ModeKeys.TRAIN:
+ loss = constant_op.constant(105)
+ predictions = constant_op.constant([501])
+ train_op = control_flow_ops.group(
+ state_ops.assign_add(training.get_global_step(), 1),
+ state_ops.assign_add(v, 3))
+ elif mode == model_fn_lib.ModeKeys.EVAL:
+ loss = constant_op.constant(106)
+ predictions = constant_op.constant([502])
+ else:
+ loss = constant_op.constant(107)
+ predictions = constant_op.constant([503])
+ return model_fn_lib.EstimatorSpec(
+ mode,
+ loss=loss,
+ train_op=train_op,
+ eval_metric_ops={
+ 'abs_err': metrics_lib.mean_absolute_error(
+ constant_op.constant(0), predictions)},
+ predictions=predictions)
+
+
+class SavedModelEstimatorTest(test.TestCase):
+
+ def setUp(self):
+ self.tmpdirs = []
+
+ def tearDown(self):
+ for tmpdir in self.tmpdirs:
+      # gfile.DeleteRecursively fails in the Windows cmake test, so use shutil.
+ shutil.rmtree(tmpdir, ignore_errors=True)
+ self.tmpdirs = []
+
+ def _get_tmp_dir(self):
+ tmpdir = tempfile.mkdtemp()
+ self.tmpdirs.append(tmpdir)
+ return tmpdir
+
+ def _export_estimator(self, train=True, evaluate=True, predict=True,
+ model_fn=model_fn_diff_modes):
+ est = estimator.Estimator(model_fn, self._get_tmp_dir())
+ est.train(input_fn=dummy_input_fn, steps=10)
+
+ input_receiver_fn_map = {}
+ if train:
+ input_receiver_fn_map[model_fn_lib.ModeKeys.TRAIN] = (
+ dummy_supervised_receiver_fn())
+ if evaluate:
+ input_receiver_fn_map[model_fn_lib.ModeKeys.EVAL] = (
+ dummy_supervised_receiver_fn())
+ if predict:
+ input_receiver_fn_map[model_fn_lib.ModeKeys.PREDICT] = (
+ dummy_serving_receiver_fn())
+
+ export_base_path = self._get_tmp_dir()
+ export_dir = contrib_export.export_all_saved_models(
+ est, export_base_path, input_receiver_fn_map)
+ return export_dir
+
+ def test_load_all_modes(self):
+ sme = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(), self._get_tmp_dir())
+ sme.train(input_fn=dummy_input_fn, steps=1)
+ sme.train(input_fn=dummy_input_fn, steps=2)
+ self.assertEqual(13, sme.get_variable_value('global_step'))
+ self.assertEqual(60, sme.get_variable_value('some_var'))
+
+ eval_results = sme.evaluate(dummy_input_fn, steps=5)
+
+ self.assertEqual(13, eval_results['global_step'])
+ self.assertEqual(106, eval_results['loss'])
+ self.assertEqual(502, eval_results['metrics/abs_err'])
+
+ predictions = next(sme.predict(dummy_input_fn_features_only))
+ self.assertDictEqual({'output': 503}, predictions)
+
+ def test_load_all_modes_no_train(self):
+ """Ensure that all functions can be used without requiring a ckpt."""
+ sme = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(), self._get_tmp_dir())
+ eval_results = sme.evaluate(dummy_input_fn, steps=5)
+ self.assertEqual(10, eval_results['global_step'])
+ self.assertEqual(106, eval_results['loss'])
+ self.assertEqual(502, eval_results['metrics/abs_err'])
+
+ predictions = next(sme.predict(dummy_input_fn_features_only))
+ self.assertDictEqual({'output': 503}, predictions)
+
+ def test_partial_exported_estimator(self):
+ sme1 = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(train=False, predict=False), self._get_tmp_dir())
+ sme1.evaluate(dummy_input_fn, steps=5)
+ with self.assertRaisesRegexp(RuntimeError, 'train mode is not available'):
+ sme1.train(input_fn=dummy_input_fn, steps=1)
+ with self.assertRaisesRegexp(RuntimeError, 'infer mode is not available'):
+ next(sme1.predict(dummy_input_fn_features_only))
+
+ sme2 = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(evaluate=False), self._get_tmp_dir())
+ sme2.train(input_fn=dummy_input_fn, steps=1)
+ next(sme2.predict(dummy_input_fn_features_only))
+ with self.assertRaisesRegexp(RuntimeError, 'eval mode is not available'):
+ sme2.evaluate(dummy_input_fn, steps=5)
+
+ def test_with_incorrect_input(self):
+ sme = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(), self._get_tmp_dir())
+
+ def bad_shape_input_fn():
+ return dataset_ops.Dataset.from_tensors((
+ {'x': constant_op.constant([1, 2], dtype=dtypes.int64)},
+ constant_op.constant([1, 2], dtype=dtypes.float32)))
+
+ with self.assertRaisesRegexp(ValueError, 'Expected shape'):
+ sme.train(bad_shape_input_fn, steps=1)
+
+ def bad_dtype_input_fn():
+ return dataset_ops.Dataset.from_tensors((
+ {'x': constant_op.constant([[1], [1]], dtype=dtypes.int32)},
+ constant_op.constant([[1], [1]], dtype=dtypes.int64)))
+
+ with self.assertRaisesRegexp(ValueError, 'Expected dtype'):
+ sme.train(bad_dtype_input_fn, steps=1)
+
+ def test_input_fn_with_global_step(self):
+ sme = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(), self._get_tmp_dir())
+
+ def bad_input_fn():
+ training.get_or_create_global_step()
+ return dataset_ops.Dataset.from_tensors((
+ {'x': constant_op.constant([[1], [1]], dtype=dtypes.int64)},
+ constant_op.constant([[1], [1]], dtype=dtypes.float32)))
+
+ with self.assertRaisesRegexp(RuntimeError,
+ 'Graph must not contain a global step tensor'):
+ sme.train(bad_input_fn, steps=1)
+
+ def test_re_export_saved_model_serving_only(self):
+ sme = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(), self._get_tmp_dir())
+ sme.train(dummy_input_fn, steps=3)
+ self.assertEqual(13, sme.get_variable_value('global_step'))
+ self.assertEqual(60, sme.get_variable_value('some_var'))
+
+ predictions = next(sme.predict(dummy_input_fn_features_only))
+ self.assertDictEqual({'output': 503}, predictions)
+
+ # Export SavedModel, and test that the variable and prediction values are
+ # the same.
+ sme_export_dir = sme.export_savedmodel(
+ self._get_tmp_dir(), dummy_serving_receiver_fn())
+
+ sme2 = saved_model_estimator.SavedModelEstimator(
+ sme_export_dir, self._get_tmp_dir())
+ self.assertEqual(60, sme.get_variable_value('some_var'))
+ self.assertEqual(13, sme.get_variable_value('global_step'))
+
+ predictions = next(sme2.predict(dummy_input_fn_features_only))
+ self.assertDictEqual({'output': 503}, predictions)
+
+ def test_re_export_saved_model(self):
+ sme = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(), self._get_tmp_dir())
+ self.assertDictEqual(
+ {'loss': 106, 'metrics/abs_err': 502, 'global_step': 10},
+ sme.evaluate(dummy_input_fn, steps=1))
+
+ sme.train(dummy_input_fn, steps=3)
+ self.assertDictEqual(
+ {'loss': 106, 'metrics/abs_err': 502, 'global_step': 13},
+ sme.evaluate(dummy_input_fn, steps=1))
+ self.assertEqual(60, sme.get_variable_value('some_var'))
+
+ predictions = next(sme.predict(dummy_input_fn_features_only))
+ self.assertDictEqual({'output': 503}, predictions)
+
+ # Export SavedModel for all modes
+ input_receiver_fn_map = {
+ model_fn_lib.ModeKeys.TRAIN: dummy_supervised_receiver_fn(),
+ model_fn_lib.ModeKeys.EVAL: dummy_supervised_receiver_fn(),
+ model_fn_lib.ModeKeys.PREDICT: dummy_serving_receiver_fn()}
+ sme_export_dir = contrib_export.export_all_saved_models(
+ sme, self._get_tmp_dir(), input_receiver_fn_map)
+
+ sme2 = saved_model_estimator.SavedModelEstimator(
+ sme_export_dir, self._get_tmp_dir())
+ self.assertDictEqual(
+ {'loss': 106, 'metrics/abs_err': 502, 'global_step': 13},
+ sme.evaluate(dummy_input_fn, steps=1))
+ self.assertEqual(60, sme.get_variable_value('some_var'))
+
+ sme.train(dummy_input_fn, steps=7)
+ self.assertEqual(20, sme.get_variable_value('global_step'))
+
+ predictions = next(sme2.predict(dummy_input_fn_features_only))
+ self.assertDictEqual({'output': 503}, predictions)
+
+ def test_load_saved_model_from_serving_only(self):
+ def model_fn(features, labels, mode):
+ _, _ = features, labels
+ return model_fn_lib.EstimatorSpec(
+ mode,
+ loss=constant_op.constant([103]),
+ train_op=state_ops.assign_add(training.get_global_step(), 1),
+ predictions=constant_op.constant([502]),
+ export_outputs={'test': export_output.ClassificationOutput(
+ constant_op.constant([[32.]]))})
+
+ est = estimator.Estimator(model_fn, self._get_tmp_dir())
+ est.train(input_fn=dummy_input_fn, steps=10)
+
+ def serving_input_receiver_fn():
+ return export.ServingInputReceiver(
+ {'test-features': constant_op.constant([[1], [1]])},
+ array_ops.placeholder(dtype=dtypes.string))
+
+ export_dir = est.export_savedmodel(
+ self._get_tmp_dir(), serving_input_receiver_fn)
+
+ sme = saved_model_estimator.SavedModelEstimator(
+ export_dir, self._get_tmp_dir())
+
+ def input_fn():
+ return {'inputs': constant_op.constant('someinputstr')}
+
+ prediction = next(sme.predict(input_fn))
+ self.assertDictEqual({'scores': 32}, prediction)
+
+ def test_with_local_init_op(self):
+ def model_fn(features, labels, mode):
+ _, _ = features, labels
+ v = variables.Variable(21, name='some_var')
+ scaffold = monitored_session.Scaffold(
+ local_init_op=state_ops.assign_add(v, -3).op
+ )
+ return model_fn_lib.EstimatorSpec(
+ mode,
+ scaffold=scaffold,
+ train_op=state_ops.assign_add(training.get_global_step(), 1),
+ loss=array_ops.identity(v))
+ export_dir = self._export_estimator(predict=False, model_fn=model_fn)
+ sme = saved_model_estimator.SavedModelEstimator(
+ export_dir, self._get_tmp_dir())
+
+ eval_results1 = sme.evaluate(dummy_input_fn, steps=2)
+ self.assertEqual(15, eval_results1['loss'])
+
+ sme.train(dummy_input_fn, steps=1)
+ self.assertEqual(15, sme.get_variable_value('some_var'))
+
+ eval_results2 = sme.evaluate(dummy_input_fn, steps=5)
+ self.assertEqual(12, eval_results2['loss'])
+
+ def test_with_working_input_fn(self):
+ def model_fn(features, labels, mode):
+ loss = None
+ if labels is not None:
+ loss = labels[0][0] + labels[1][0]
+ return model_fn_lib.EstimatorSpec(
+ mode,
+ loss=loss,
+ train_op=state_ops.assign_add(training.get_global_step(), 1),
+ predictions={'features_0': array_ops.identity([features['x'][0][0]]),
+ 'features_1': array_ops.identity([features['x'][1][0]])})
+
+ sme = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(model_fn=model_fn), self._get_tmp_dir())
+ eval_results = sme.evaluate(dummy_input_fn, steps=1)
+ self.assertEqual(1, eval_results['loss'])
+
+ predictions = next(sme.predict(dummy_input_fn_features_only))
+ self.assertDictEqual({'features_0': 5, 'features_1': 6}, predictions)
+
+ def test_control_dependency(self):
+    # Control dependencies are saved with "^" prepended to the input name.
+    # The input map must include the control-dependency names as well.
+ def model_fn(features, labels, mode):
+ _ = labels
+ with ops.control_dependencies([features['x']]):
+ loss = features['x'][1][0]
+ return model_fn_lib.EstimatorSpec(
+ mode,
+ loss=loss,
+ train_op=state_ops.assign_add(training.get_global_step(), 1))
+ sme = saved_model_estimator.SavedModelEstimator(
+ self._export_estimator(train=False, predict=False, model_fn=model_fn),
+ self._get_tmp_dir())
+ sme.evaluate(dummy_input_fn, steps=1) # Should run without error
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index e8e3180019..322d5c335e 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
from tensorflow.python.platform import resource_loader
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import saver as tf_saver
@@ -199,10 +200,20 @@ def global_variable(initial_value,
@contrib_add_arg_scope
-def variable(name, shape=None, dtype=None, initializer=None,
- regularizer=None, trainable=True, collections=None,
- caching_device=None, device=None,
- partitioner=None, custom_getter=None, use_resource=None):
+def variable(name,
+ shape=None,
+ dtype=None,
+ initializer=None,
+ regularizer=None,
+ trainable=True,
+ collections=None,
+ caching_device=None,
+ device=None,
+ partitioner=None,
+ custom_getter=None,
+ use_resource=None,
+ synchronization=variables.VariableSynchronization.AUTO,
+ aggregation=variables.VariableAggregation.NONE):
"""Gets an existing variable with these parameters or creates a new one.
Args:
@@ -228,6 +239,15 @@ def variable(name, shape=None, dtype=None, initializer=None,
custom_getter: Callable that allows overwriting the internal
get_variable method and has to have the same signature.
use_resource: If `True` use a ResourceVariable instead of a Variable.
+    synchronization: Indicates when a distributed variable will be
+      aggregated. Accepted values are constants defined in the class
+      @{tf.VariableSynchronization}. By default the synchronization is set to
+      `AUTO` and the current `DistributionStrategy` chooses when to
+      synchronize. If `synchronization` is set to `ON_READ`, `trainable` must
+      not be set to `True`.
+ aggregation: Indicates how a distributed variable will be aggregated.
+ Accepted values are constants defined in the class
+ @{tf.VariableAggregation}.
Returns:
The created or existing variable.
@@ -242,21 +262,36 @@ def variable(name, shape=None, dtype=None, initializer=None,
getter = functools.partial(custom_getter,
reuse=variable_scope.get_variable_scope().reuse)
with ops.device(device or ''):
- return getter(name, shape=shape, dtype=dtype,
- initializer=initializer,
- regularizer=regularizer,
- trainable=trainable,
- collections=collections,
- caching_device=caching_device,
- partitioner=partitioner,
- use_resource=use_resource)
+ return getter(
+ name,
+ shape=shape,
+ dtype=dtype,
+ initializer=initializer,
+ regularizer=regularizer,
+ trainable=trainable,
+ collections=collections,
+ caching_device=caching_device,
+ partitioner=partitioner,
+ use_resource=use_resource,
+ synchronization=synchronization,
+ aggregation=aggregation)
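+# Usage sketch (hypothetical values): a non-trainable accumulator that is
+# synchronized on read and summed across devices under a DistributionStrategy:
+#   v = variable('acc', shape=[], trainable=False,
+#                synchronization=variables.VariableSynchronization.ON_READ,
+#                aggregation=variables.VariableAggregation.SUM)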
@contrib_add_arg_scope
-def model_variable(name, shape=None, dtype=dtypes.float32, initializer=None,
- regularizer=None, trainable=True, collections=None,
- caching_device=None, device=None, partitioner=None,
- custom_getter=None, use_resource=None):
+def model_variable(name,
+ shape=None,
+ dtype=dtypes.float32,
+ initializer=None,
+ regularizer=None,
+ trainable=True,
+ collections=None,
+ caching_device=None,
+ device=None,
+ partitioner=None,
+ custom_getter=None,
+ use_resource=None,
+ synchronization=variables.VariableSynchronization.AUTO,
+ aggregation=variables.VariableAggregation.NONE):
"""Gets an existing model variable with these parameters or creates a new one.
Args:
@@ -283,18 +318,36 @@ def model_variable(name, shape=None, dtype=dtypes.float32, initializer=None,
custom_getter: Callable that allows overwriting the internal
get_variable method and has to have the same signature.
use_resource: If `True` use a ResourceVariable instead of a Variable.
+    synchronization: Indicates when a distributed variable will be
+      aggregated. Accepted values are constants defined in the class
+      @{tf.VariableSynchronization}. By default the synchronization is set to
+      `AUTO` and the current `DistributionStrategy` chooses when to
+      synchronize. If `synchronization` is set to `ON_READ`, `trainable` must
+      not be set to `True`.
+ aggregation: Indicates how a distributed variable will be aggregated.
+ Accepted values are constants defined in the class
+ @{tf.VariableAggregation}.
Returns:
The created or existing variable.
"""
collections = list(collections or [])
collections += [ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES]
- var = variable(name, shape=shape, dtype=dtype,
- initializer=initializer, regularizer=regularizer,
- trainable=trainable, collections=collections,
- caching_device=caching_device, device=device,
- partitioner=partitioner, custom_getter=custom_getter,
- use_resource=use_resource)
+ var = variable(
+ name,
+ shape=shape,
+ dtype=dtype,
+ initializer=initializer,
+ regularizer=regularizer,
+ trainable=trainable,
+ collections=collections,
+ caching_device=caching_device,
+ device=device,
+ partitioner=partitioner,
+ custom_getter=custom_getter,
+ use_resource=use_resource,
+ synchronization=synchronization,
+ aggregation=aggregation)
return var
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 4554a3d89a..0ccb4583ab 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -443,6 +443,8 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
: dnn::DataLayout::kBatchDepthYX;
constexpr auto filter_layout = is_int8x4 ? dnn::FilterLayout::kOutputInputYX4
: dnn::FilterLayout::kOutputInputYX;
+ constexpr auto compute_data_format =
+ is_int8x4 ? FORMAT_NCHW_VECT_C : FORMAT_NCHW;
dnn::BatchDescriptor conv_input_desc;
conv_input_desc.set_count(batch_size)
@@ -529,6 +531,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
batch_size,
conv_input_depth,
{{conv_input_rows, conv_input_cols}},
+ compute_data_format,
output_depth,
{{filter_rows, filter_cols}},
// TODO(yangzihao): Add support for arbitrary dilations for fused conv.
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
index ba52697679..b9c131a2e9 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
@@ -29,13 +29,13 @@ namespace tensorflow {
class FusedConvParameters : public ConvParameters {
public:
FusedConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
- int64 out_depths, const SpatialArray& filter,
- const SpatialArray& dilation, const SpatialArray& stride,
- const SpatialArray& padding, DataType dtype,
- int device_id, bool has_side_input,
+ TensorFormat data_format, int64 out_depths,
+ const SpatialArray& filter, const SpatialArray& dilation,
+ const SpatialArray& stride, const SpatialArray& padding,
+ DataType dtype, int device_id, bool has_side_input,
ActivationMode activation_mode)
- : ConvParameters(batch, in_depths, in, out_depths, filter, dilation,
- stride, padding, dtype, device_id),
+ : ConvParameters(batch, in_depths, in, data_format, out_depths, filter,
+ dilation, stride, padding, dtype, device_id),
activation_mode_(activation_mode),
has_side_input_(has_side_input) {
hash_code_ = Hash64Combine(hash_code_, has_side_input);
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index c8c2af49d4..053d4e3e97 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -57,6 +57,7 @@ py_library(
py_test(
name = "train_test",
srcs = ["python/train_test.py"],
+ shard_count = 50,
srcs_version = "PY2AND3",
tags = ["notsan"],
deps = [
@@ -80,6 +81,7 @@ py_test(
"//tensorflow/python:variables",
"//tensorflow/python/ops/distributions",
"//third_party/py/numpy",
+ "@absl_py//absl/testing:parameterized",
],
)
@@ -194,11 +196,16 @@ py_test(
srcs = ["python/losses/python/tuple_losses_test.py"],
srcs_version = "PY2AND3",
deps = [
+ ":losses_impl",
":namedtuples",
":tuple_losses",
+ "//tensorflow/contrib/layers:layers_py",
+ "//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
"//tensorflow/python:constant_op",
"//tensorflow/python:dtypes",
+ "//tensorflow/python:math_ops",
+ "//tensorflow/python:variable_scope",
"//tensorflow/python:variables",
"//third_party/py/numpy",
],
@@ -255,12 +262,15 @@ py_library(
py_test(
name = "random_tensor_pool_test",
srcs = ["python/features/python/random_tensor_pool_test.py"],
+ shard_count = 6,
srcs_version = "PY2AND3",
deps = [
":random_tensor_pool",
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
+ "//tensorflow/python:constant_op",
"//tensorflow/python:dtypes",
+ "//tensorflow/python:framework_ops",
"//third_party/py/numpy",
],
)
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
index 9e4ec59e70..ca2d724b49 100644
--- a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
@@ -36,16 +36,15 @@ from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.ops import random_ops
+from tensorflow.python.util import nest
__all__ = [
'tensor_pool',
]
-def _to_tuple(x):
- if isinstance(x, (list, tuple)):
- return tuple(x)
- return (x,)
+def _to_list(x):
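+  # A single Tensor becomes a one-element list; any other sequence (list,
+  # tuple, or a dequeued queue output) is converted to a list as-is.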
+ return [x] if isinstance(x, ops.Tensor) else list(x)
def tensor_pool(input_values,
@@ -63,8 +62,8 @@ def tensor_pool(input_values,
`pool_size` = 0 or `pooling_probability` = 0.
Args:
- input_values: A `Tensor`, or a list or tuple of `Tensor`s from which to read
- values to be pooled.
+    input_values: An arbitrarily nested structure of `tf.Tensor`s, from which
+      to read values to be pooled.
pool_size: An integer specifying the maximum size of the pool. Defaults to
50.
pooling_probability: A float `Tensor` specifying the probability of getting
@@ -72,9 +71,10 @@ def tensor_pool(input_values,
name: A string prefix for the name scope for all tensorflow ops.
Returns:
- A `Tensor`, or a list or tuple of `Tensor`s (according to the type ofx
- `input_values`) which is with given probability either the `input_values` or
- a randomly chosen sample that was previously inserted in the pool.
+ A nested structure of `Tensor` objects with the same structure as
+ `input_values`. With the given probability, the Tensor values are either the
+ same as in `input_values` or a randomly chosen sample that was previously
+ inserted in the pool.
Raises:
ValueError: If `pool_size` is negative.
@@ -86,11 +86,10 @@ def tensor_pool(input_values,
return input_values
original_input_values = input_values
- input_values = _to_tuple(input_values)
+ input_values = nest.flatten(input_values)
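+  # The pool operates on the flattened list; the original nesting is restored
+  # at the end via nest.pack_sequence_as(original_input_values, ...).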
- with ops.name_scope(
- '{}_pool_queue'.format(name),
- values=input_values + (pooling_probability,)):
+ with ops.name_scope('{}_pool_queue'.format(name),
+ values=input_values + [pooling_probability]):
pool_queue = data_flow_ops.RandomShuffleQueue(
capacity=pool_size,
min_after_dequeue=0,
@@ -112,10 +111,10 @@ def tensor_pool(input_values,
def _get_input_value_pooled():
enqueue_op = pool_queue.enqueue(input_values)
with ops.control_dependencies([enqueue_op]):
- return tuple(array_ops.identity(v) for v in input_values)
+ return [array_ops.identity(v) for v in input_values]
def _get_random_pool_value_and_enqueue_input():
- dequeue_values = _to_tuple(pool_queue.dequeue())
+ dequeue_values = _to_list(pool_queue.dequeue())
with ops.control_dependencies(dequeue_values):
enqueue_op = pool_queue.enqueue(input_values)
with ops.control_dependencies([enqueue_op]):
@@ -124,7 +123,7 @@ def tensor_pool(input_values,
return control_flow_ops.cond(prob, lambda: dequeue_values,
lambda: input_values)
- output_values = _to_tuple(control_flow_ops.cond(
+ output_values = _to_list(control_flow_ops.cond(
pool_queue.size() < pool_size, _get_input_value_pooled,
_get_random_pool_value_and_enqueue_input))
@@ -132,8 +131,4 @@ def tensor_pool(input_values,
for input_value, output_value in zip(input_values, output_values):
output_value.set_shape(input_value.shape)
- if isinstance(original_input_values, list):
- return list(output_values)
- elif isinstance(original_input_values, tuple):
- return output_values
- return output_values[0]
+ return nest.pack_sequence_as(original_input_values, output_values)
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
index d8cf549cf7..08584dcd65 100644
--- a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
@@ -21,7 +21,9 @@ from __future__ import print_function
import numpy as np
from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import tensor_pool
+from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import test
@@ -111,6 +113,23 @@ class TensorPoolTest(test.TestCase):
self.assertEqual(len(outs), len(input_values))
self.assertEqual(outs[1] - outs[0], 1)
+ def test_pool_preserves_shape(self):
+ t = constant_op.constant(1)
+ input_values = [[t, t, t], (t, t), t]
+ output_values = tensor_pool(input_values, pool_size=5)
+ # Overall shape.
+ self.assertIsInstance(output_values, list)
+ self.assertEqual(3, len(output_values))
+ # Shape of first element.
+ self.assertIsInstance(output_values[0], list)
+ self.assertEqual(3, len(output_values[0]))
+ # Shape of second element.
+ self.assertIsInstance(output_values[1], tuple)
+ self.assertEqual(2, len(output_values[1]))
+ # Shape of third element.
+ self.assertIsInstance(output_values[2], ops.Tensor)
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index 1ba3a64167..d389748374 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -949,6 +949,11 @@ def cycle_consistency_loss(data_x,
* loss = (loss_x2x + loss_y2y) / 2
where `loss` is the final result.
+  For the L1-norm, we follow the original implementation:
+  https://github.com/junyanz/CycleGAN/blob/master/models/cycle_gan_model.lua
+  and use the L1-norm of the pixel-wise error normalized by data size, so that
+  `cycle_loss_weight` can be specified independently of image size.
+
See https://arxiv.org/abs/1703.10593 for more details.
Args:
@@ -965,19 +970,12 @@ def cycle_consistency_loss(data_x,
A scalar `Tensor` of cycle consistency loss.
"""
- def _partial_cycle_consistency_loss(data, reconstructed_data):
- # Following the original implementation
- # https://github.com/junyanz/CycleGAN/blob/master/models/cycle_gan_model.lua
- # use L1-norm of pixel-wise error normalized by data size so that
- # `cycle_loss_weight` can be specified independent of image size.
- return math_ops.reduce_mean(math_ops.abs(data - reconstructed_data))
-
with ops.name_scope(
scope,
'cycle_consistency_loss',
values=[data_x, reconstructed_data_x, data_y, reconstructed_data_y]):
- loss_x2x = _partial_cycle_consistency_loss(data_x, reconstructed_data_x)
- loss_y2y = _partial_cycle_consistency_loss(data_y, reconstructed_data_y)
+ loss_x2x = losses.absolute_difference(data_x, reconstructed_data_x)
+ loss_y2y = losses.absolute_difference(data_y, reconstructed_data_y)
loss = (loss_x2x + loss_y2y) / 2.0
if add_summaries:
summary.scalar('cycle_consistency_loss_x2x', loss_x2x)
diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
index dcc3f94c2d..221c70c38b 100644
--- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
@@ -80,6 +80,9 @@ __all__ = [
'mutual_information_penalty',
'combine_adversarial_loss',
'cycle_consistency_loss',
+ 'stargan_generator_loss_wrapper',
+ 'stargan_discriminator_loss_wrapper',
+ 'stargan_gradient_penalty_wrapper'
]
@@ -277,3 +280,86 @@ def cycle_consistency_loss(cyclegan_model, scope=None, add_summaries=False):
cyclegan_model.model_x2y.generator_inputs, cyclegan_model.reconstructed_x,
cyclegan_model.model_y2x.generator_inputs, cyclegan_model.reconstructed_y,
scope, add_summaries)
+
+
+def stargan_generator_loss_wrapper(loss_fn):
+ """Convert a generator loss function to take a StarGANModel.
+
+ The new function has the same name as the original one.
+
+ Args:
+    loss_fn: A python function taking the discriminator's real/fake prediction
+      for generated data.
+
+ Returns:
+ A new function that takes a StarGANModel namedtuple and returns the same
+ loss.
+ """
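+  # Usage sketch (mirrors tuple_losses_test.py):
+  #   wrapped = stargan_generator_loss_wrapper(
+  #       tfgan_losses_impl.wasserstein_generator_loss)
+  #   gen_loss = wrapped(stargan_model)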
+
+ def new_loss_fn(stargan_model, **kwargs):
+ return loss_fn(
+ stargan_model.discriminator_generated_data_source_predication, **kwargs)
+
+ new_docstring = """The stargan_model version of %s.""" % loss_fn.__name__
+  new_loss_fn.__doc__ = new_docstring
+ new_loss_fn.__name__ = loss_fn.__name__
+ new_loss_fn.__module__ = loss_fn.__module__
+ return new_loss_fn
+
+
+def stargan_discriminator_loss_wrapper(loss_fn):
+ """Convert a discriminator loss function to take a StarGANModel.
+
+ The new function has the same name as the original one.
+
+ Args:
+    loss_fn: A python function taking the discriminator's real/fake
+      predictions for real data and generated data.
+
+ Returns:
+ A new function that takes a StarGANModel namedtuple and returns the same
+ loss.
+ """
+
+ def new_loss_fn(stargan_model, **kwargs):
+ return loss_fn(
+ stargan_model.discriminator_input_data_source_predication,
+ stargan_model.discriminator_generated_data_source_predication, **kwargs)
+
+ new_docstring = """The stargan_model version of %s.""" % loss_fn.__name__
+  new_loss_fn.__doc__ = new_docstring
+ new_loss_fn.__name__ = loss_fn.__name__
+ new_loss_fn.__module__ = loss_fn.__module__
+ return new_loss_fn
+
+
+def stargan_gradient_penalty_wrapper(loss_fn):
+ """Convert a gradient penalty function to take a StarGANModel.
+
+ The new function has the same name as the original one.
+
+ Args:
+    loss_fn: A python function taking real_data, generated_data,
+      generator_inputs (here, the discriminator's condition, i.e. the number
+      of domains), discriminator_fn, and discriminator_scope.
+
+ Returns:
+ A new function that takes a StarGANModel namedtuple and returns the same
+ loss.
+ """
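+  # Usage sketch: the generator_inputs argument is filled with the number of
+  # domains, read from the one-hot input_data_domain_label, e.g.
+  #   wrapped = stargan_gradient_penalty_wrapper(
+  #       tfgan_losses_impl.wasserstein_gradient_penalty)
+  #   penalty = wrapped(stargan_model)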
+
+ def new_loss_fn(stargan_model, **kwargs):
+ num_domains = stargan_model.input_data_domain_label.shape.as_list()[-1]
+ return loss_fn(
+ real_data=stargan_model.input_data,
+ generated_data=stargan_model.generated_data,
+ generator_inputs=num_domains,
+ discriminator_fn=stargan_model.discriminator_fn,
+ discriminator_scope=stargan_model.discriminator_scope,
+ **kwargs)
+
+ new_docstring = """The stargan_model version of %s.""" % loss_fn.__name__
+  new_loss_fn.__doc__ = new_docstring
+ new_loss_fn.__name__ = loss_fn.__name__
+ new_loss_fn.__module__ = loss_fn.__module__
+ return new_loss_fn
diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py
index aa1ef11172..a559bbfa11 100644
--- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/tuple_losses_test.py
@@ -22,10 +22,15 @@ import collections
import numpy as np
+from tensorflow.contrib import layers
from tensorflow.contrib.gan.python import namedtuples
+from tensorflow.contrib.gan.python.losses.python import losses_impl as tfgan_losses_impl
from tensorflow.contrib.gan.python.losses.python import tuple_losses_impl as tfgan_losses
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
@@ -129,6 +134,9 @@ manual_tests = [
'mutual_information_penalty',
'wasserstein_gradient_penalty',
'cycle_consistency_loss',
+ 'stargan_generator_loss_wrapper',
+ 'stargan_discriminator_loss_wrapper',
+ 'stargan_gradient_penalty_wrapper'
]
discriminator_keyword_args = {
@@ -175,6 +183,112 @@ class CycleConsistencyLossTest(test.TestCase):
self.assertNear(5.0, loss.eval(), 1e-5)
+class StarGANLossWrapperTest(test.TestCase):
+
+ def setUp(self):
+ super(StarGANLossWrapperTest, self).setUp()
+
+ self.input_data = array_ops.ones([1, 2, 2, 3])
+ self.input_data_domain_label = constant_op.constant([[0, 1]])
+ self.generated_data = array_ops.ones([1, 2, 2, 3])
+ self.discriminator_input_data_source_predication = array_ops.ones([1])
+ self.discriminator_generated_data_source_predication = array_ops.ones([1])
+
+ def _discriminator_fn(inputs, num_domains):
+ """Differentiable dummy discriminator for StarGAN."""
+ hidden = layers.flatten(inputs)
+ output_src = math_ops.reduce_mean(hidden, axis=1)
+ output_cls = layers.fully_connected(
+ inputs=hidden,
+ num_outputs=num_domains,
+ activation_fn=None,
+ normalizer_fn=None,
+ biases_initializer=None)
+ return output_src, output_cls
+
+ with variable_scope.variable_scope('discriminator') as dis_scope:
+ pass
+
+ self.model = namedtuples.StarGANModel(
+ input_data=self.input_data,
+ input_data_domain_label=self.input_data_domain_label,
+ generated_data=self.generated_data,
+ generated_data_domain_target=None,
+ reconstructed_data=None,
+ discriminator_input_data_source_predication=self.
+ discriminator_input_data_source_predication,
+ discriminator_generated_data_source_predication=self.
+ discriminator_generated_data_source_predication,
+ discriminator_input_data_domain_predication=None,
+ discriminator_generated_data_domain_predication=None,
+ generator_variables=None,
+ generator_scope=None,
+ generator_fn=None,
+ discriminator_variables=None,
+ discriminator_scope=dis_scope,
+ discriminator_fn=_discriminator_fn)
+
+ self.discriminator_fn = _discriminator_fn
+ self.discriminator_scope = dis_scope
+
+ def test_stargan_generator_loss_wrapper(self):
+ """Test StarGAN generator loss wrapper."""
+ loss_fn = tfgan_losses_impl.wasserstein_generator_loss
+ wrapped_loss_fn = tfgan_losses.stargan_generator_loss_wrapper(loss_fn)
+
+ loss_result_tensor = loss_fn(
+ self.discriminator_generated_data_source_predication)
+ wrapped_loss_result_tensor = wrapped_loss_fn(self.model)
+
+ with self.test_session() as sess:
+ sess.run(variables.global_variables_initializer())
+ loss_result, wrapped_loss_result = sess.run(
+ [loss_result_tensor, wrapped_loss_result_tensor])
+ self.assertAlmostEqual(loss_result, wrapped_loss_result)
+
+ def test_stargan_discriminator_loss_wrapper(self):
+ """Test StarGAN discriminator loss wrapper."""
+ loss_fn = tfgan_losses_impl.wasserstein_discriminator_loss
+ wrapped_loss_fn = tfgan_losses.stargan_discriminator_loss_wrapper(loss_fn)
+
+ loss_result_tensor = loss_fn(
+ self.discriminator_generated_data_source_predication,
+ self.discriminator_generated_data_source_predication)
+ wrapped_loss_result_tensor = wrapped_loss_fn(self.model)
+
+ with self.test_session() as sess:
+ sess.run(variables.global_variables_initializer())
+ loss_result, wrapped_loss_result = sess.run(
+ [loss_result_tensor, wrapped_loss_result_tensor])
+ self.assertAlmostEqual(loss_result, wrapped_loss_result)
+
+ def test_stargan_gradient_penalty_wrapper(self):
+    """Test StarGAN gradient penalty wrapper.
+
+    Notes:
+      The random interpolates are handled by setting the reconstruction to be
+      the same as the input.
+    """
+ loss_fn = tfgan_losses_impl.wasserstein_gradient_penalty
+ wrapped_loss_fn = tfgan_losses.stargan_gradient_penalty_wrapper(loss_fn)
+
+ loss_result_tensor = loss_fn(
+ real_data=self.input_data,
+ generated_data=self.generated_data,
+ generator_inputs=self.input_data_domain_label.shape.as_list()[-1],
+ discriminator_fn=self.discriminator_fn,
+ discriminator_scope=self.discriminator_scope)
+ wrapped_loss_result_tensor = wrapped_loss_fn(self.model)
+
+ with self.test_session() as sess:
+ sess.run(variables.global_variables_initializer())
+ loss_result, wrapped_loss_result = sess.run(
+ [loss_result_tensor, wrapped_loss_result_tensor])
+ self.assertAlmostEqual(loss_result, wrapped_loss_result)
+
+
if __name__ == '__main__':
for loss_name in tfgan_losses.__all__:
if loss_name in manual_tests: continue
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 49d9327333..03f52d214b 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -34,6 +34,7 @@ from __future__ import print_function
from tensorflow.contrib.framework.python.ops import variables as variables_lib
from tensorflow.contrib.gan.python import losses as tfgan_losses
from tensorflow.contrib.gan.python import namedtuples
+from tensorflow.contrib.gan.python.losses.python import losses_impl as tfgan_losses_impl
from tensorflow.contrib.slim.python.slim import learning as slim_learning
from tensorflow.contrib.training.python.training import training
from tensorflow.python.framework import dtypes
@@ -41,14 +42,17 @@ from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops.distributions import distribution as ds
from tensorflow.python.ops.losses import losses
+from tensorflow.python.summary import summary
from tensorflow.python.training import session_run_hook
from tensorflow.python.training import sync_replicas_optimizer
from tensorflow.python.training import training_util
+
__all__ = [
'gan_model',
'infogan_model',
@@ -514,33 +518,42 @@ def _tensor_pool_adjusted_model(model, tensor_pool_fn):
Raises:
ValueError: If tensor pool does not support the `model`.
"""
- if tensor_pool_fn is None:
- return model
-
- pooled_generated_data, pooled_generator_inputs = tensor_pool_fn(
- (model.generated_data, model.generator_inputs))
-
if isinstance(model, namedtuples.GANModel):
+ pooled_generator_inputs, pooled_generated_data = tensor_pool_fn(
+ (model.generator_inputs, model.generated_data))
with variable_scope.variable_scope(model.discriminator_scope, reuse=True):
dis_gen_outputs = model.discriminator_fn(pooled_generated_data,
pooled_generator_inputs)
- return model._replace(discriminator_gen_outputs=dis_gen_outputs)
+ return model._replace(
+ generator_inputs=pooled_generator_inputs,
+ generated_data=pooled_generated_data,
+ discriminator_gen_outputs=dis_gen_outputs)
elif isinstance(model, namedtuples.ACGANModel):
+ pooled_generator_inputs, pooled_generated_data = tensor_pool_fn(
+ (model.generator_inputs, model.generated_data))
with variable_scope.variable_scope(model.discriminator_scope, reuse=True):
- (dis_pooled_gen_outputs,
- dis_pooled_gen_classification_logits) = model.discriminator_fn(
+ (pooled_discriminator_gen_outputs,
+ pooled_discriminator_gen_classification_logits) = model.discriminator_fn(
pooled_generated_data, pooled_generator_inputs)
return model._replace(
- discriminator_gen_outputs=dis_pooled_gen_outputs,
+ generator_inputs=pooled_generator_inputs,
+ generated_data=pooled_generated_data,
+ discriminator_gen_outputs=pooled_discriminator_gen_outputs,
discriminator_gen_classification_logits=
- dis_pooled_gen_classification_logits)
+ pooled_discriminator_gen_classification_logits)
elif isinstance(model, namedtuples.InfoGANModel):
+ pooled_generator_inputs, pooled_generated_data, pooled_structured_input = (
+ tensor_pool_fn((model.generator_inputs, model.generated_data,
+ model.structured_generator_inputs)))
with variable_scope.variable_scope(model.discriminator_scope, reuse=True):
- (dis_pooled_gen_outputs,
+ (pooled_discriminator_gen_outputs,
pooled_predicted_distributions) = model.discriminator_and_aux_fn(
pooled_generated_data, pooled_generator_inputs)
return model._replace(
- discriminator_gen_outputs=dis_pooled_gen_outputs,
+ generator_inputs=pooled_generator_inputs,
+ generated_data=pooled_generated_data,
+ structured_generator_inputs=pooled_structured_input,
+ discriminator_gen_outputs=pooled_discriminator_gen_outputs,
predicted_distributions=pooled_predicted_distributions)
else:
raise ValueError('Tensor pool does not support `model`: %s.' % type(model))
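+# For illustration, callers typically build the pool function from
+# `random_tensor_pool.tensor_pool`, as the tests below do (a sketch;
+# `pool_size=5` is an arbitrary choice):
+#
+#   tensor_pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=5)
+#   pooled_model = _tensor_pool_adjusted_model(model, tensor_pool_fn)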
@@ -632,33 +645,38 @@ def gan_loss(
'is provided, `model` must be an `ACGANModel`. Instead, was %s.' %
type(model))
+ # Optionally create pooled model.
+ pooled_model = (_tensor_pool_adjusted_model(model, tensor_pool_fn) if
+ tensor_pool_fn else model)
+
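+ # The discriminator-side losses below consume `pooled_model`, so the
+ # discriminator also trains on generated data drawn from the history
+ # buffer; generator-side losses keep using the current `model`.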
# Create standard losses.
gen_loss = generator_loss_fn(model, add_summaries=add_summaries)
- dis_loss = discriminator_loss_fn(
- _tensor_pool_adjusted_model(model, tensor_pool_fn),
- add_summaries=add_summaries)
+ dis_loss = discriminator_loss_fn(pooled_model, add_summaries=add_summaries)
# Add optional extra losses.
if _use_aux_loss(gradient_penalty_weight):
gp_loss = tfgan_losses.wasserstein_gradient_penalty(
- model,
+ pooled_model,
epsilon=gradient_penalty_epsilon,
target=gradient_penalty_target,
one_sided=gradient_penalty_one_sided,
add_summaries=add_summaries)
dis_loss += gradient_penalty_weight * gp_loss
if _use_aux_loss(mutual_information_penalty_weight):
- info_loss = tfgan_losses.mutual_information_penalty(
+ gen_info_loss = tfgan_losses.mutual_information_penalty(
model, add_summaries=add_summaries)
- dis_loss += mutual_information_penalty_weight * info_loss
- gen_loss += mutual_information_penalty_weight * info_loss
+ dis_info_loss = (gen_info_loss if tensor_pool_fn is None else
+ tfgan_losses.mutual_information_penalty(
+ pooled_model, add_summaries=add_summaries))
+ gen_loss += mutual_information_penalty_weight * gen_info_loss
+ dis_loss += mutual_information_penalty_weight * dis_info_loss
if _use_aux_loss(aux_cond_generator_weight):
ac_gen_loss = tfgan_losses.acgan_generator_loss(
model, add_summaries=add_summaries)
gen_loss += aux_cond_generator_weight * ac_gen_loss
if _use_aux_loss(aux_cond_discriminator_weight):
ac_disc_loss = tfgan_losses.acgan_discriminator_loss(
- model, add_summaries=add_summaries)
+ pooled_model, add_summaries=add_summaries)
dis_loss += aux_cond_discriminator_weight * ac_disc_loss
# Gathers auxiliary losses.
if model.generator_scope:
@@ -737,6 +755,130 @@ def cyclegan_loss(
return namedtuples.CycleGANLoss(loss_x2y, loss_y2x)
+def stargan_loss(
+ model,
+ generator_loss_fn=tfgan_losses.stargan_generator_loss_wrapper(
+ tfgan_losses_impl.wasserstein_generator_loss),
+ discriminator_loss_fn=tfgan_losses.stargan_discriminator_loss_wrapper(
+ tfgan_losses_impl.wasserstein_discriminator_loss),
+ gradient_penalty_weight=10.0,
+ gradient_penalty_epsilon=1e-10,
+ gradient_penalty_target=1.0,
+ gradient_penalty_one_sided=False,
+ reconstruction_loss_fn=losses.absolute_difference,
+ reconstruction_loss_weight=10.0,
+ classification_loss_fn=losses.softmax_cross_entropy,
+ classification_loss_weight=1.0,
+ classification_one_hot=True,
+ add_summaries=True):
+ """StarGAN Loss.
+
+ The four major part can be found here: http://screen/tMRMBAohDYG.
+
+ Args:
+ model: (StarGAN) Model output of the stargan_model() function call.
+ generator_loss_fn: The loss function on the generator. Takes a
+ `StarGANModel` named tuple.
+ discriminator_loss_fn: The loss function on the discriminator. Takes a
+ `StarGANModel` namedtuple.
+ gradient_penalty_weight: (float) Gradient penalty weight. Default to 10 per
+ the original paper https://arxiv.org/abs/1711.09020. Set to 0 or None to
+ turn off gradient penalty.
+ gradient_penalty_epsilon: (float) A small positive number added for
+ numerical stability when computing the gradient norm.
+ gradient_penalty_target: (float, or tf.float `Tensor`) The target value of
+ gradient norm. Defaults to 1.0.
+ gradient_penalty_one_sided: (bool) If `True`, penalty proposed in
+ https://arxiv.org/abs/1709.08894 is used. Defaults to `False`.
+ reconstruction_loss_fn: The reconstruction loss function. Default to L1-norm
+ and the function must conform to the `tf.losses` API.
+ reconstruction_loss_weight: Reconstruction loss weight. Default to 10.0.
+ classification_loss_fn: The loss function on the discriminator's ability to
+ classify domain of the input. Default to one-hot softmax cross entropy
+ loss, and the function must conform to the `tf.losses` API.
+ classification_loss_weight: (float) Classification loss weight. Default to
+ 1.0.
+ classification_one_hot: (bool) If the label is one hot representation.
+ Default to True. If False, classification classification_loss_fn need to
+ be sigmoid cross entropy loss instead.
+ add_summaries: (bool) Add the loss to the summary
+
+ Returns:
+ GANLoss namedtuple where we have generator loss and discriminator loss.
+
+ Raises:
+ ValueError: If input StarGANModel.input_data_domain_label does not have rank
+ 2, or dimension 2 is not defined.
+ """
+
+ def _classification_loss_helper(true_labels, predict_logits, scope_name):
+ """Classification Loss Function Helper.
+
+ Args:
+ true_labels: Tensor of shape [batch_size, num_domains] representing the
+ label where each row is an one-hot vector.
+ predict_logits: Tensor of shape [batch_size, num_domains] representing the
+ predicted label logit, which is UNSCALED output from the NN.
+ scope_name: (string) Name scope of the loss component.
+
+ Returns:
+ Single scalar tensor representing the classification loss.
+ """
+
+ with ops.name_scope(scope_name, values=(true_labels, predict_logits)):
+
+ loss = classification_loss_fn(
+ onehot_labels=true_labels, logits=predict_logits)
+
+ if not classification_one_hot:
+ loss = math_ops.reduce_sum(loss, axis=1)
+ loss = math_ops.reduce_mean(loss)
+
+ if add_summaries:
+ summary.scalar(scope_name, loss)
+
+ return loss
+
+ # Check input shape.
+ model.input_data_domain_label.shape.assert_has_rank(2)
+ model.input_data_domain_label.shape[1:].assert_is_fully_defined()
+
+ # Adversarial Loss.
+ generator_loss = generator_loss_fn(model, add_summaries=add_summaries)
+ discriminator_loss = discriminator_loss_fn(model, add_summaries=add_summaries)
+
+ # Gradient Penalty.
+ if _use_aux_loss(gradient_penalty_weight):
+ gradient_penalty_fn = tfgan_losses.stargan_gradient_penalty_wrapper(
+ tfgan_losses_impl.wasserstein_gradient_penalty)
+ discriminator_loss += gradient_penalty_fn(
+ model,
+ epsilon=gradient_penalty_epsilon,
+ target=gradient_penalty_target,
+ one_sided=gradient_penalty_one_sided,
+ add_summaries=add_summaries) * gradient_penalty_weight
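+ # The wrapper maps `StarGANModel` fields onto wasserstein_gradient_penalty's
+ # arguments (input_data as real_data, generated_data, the discriminator fn
+ # and scope), as exercised by the losses tests above.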
+
+ # Reconstruction Loss.
+ reconstruction_loss = reconstruction_loss_fn(model.input_data,
+ model.reconstructed_data)
+ generator_loss += reconstruction_loss * reconstruction_loss_weight
+ if add_summaries:
+ summary.scalar('reconstruction_loss', reconstruction_loss)
+
+ # Classification Loss.
+ generator_loss += _classification_loss_helper(
+ true_labels=model.generated_data_domain_target,
+ predict_logits=model.discriminator_generated_data_domain_predication,
+ scope_name='generator_classification_loss') * classification_loss_weight
+ discriminator_loss += _classification_loss_helper(
+ true_labels=model.input_data_domain_label,
+ predict_logits=model.discriminator_input_data_domain_predication,
+ scope_name='discriminator_classification_loss'
+ ) * classification_loss_weight
+
+ return namedtuples.GANLoss(generator_loss, discriminator_loss)
+
+
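+# A minimal usage sketch (illustrative only; `images` is a hypothetical 4-D
+# image batch and `labels` a one-hot domain-label batch):
+#
+#   model = stargan_model(generator_fn, discriminator_fn, images, labels)
+#   loss = stargan_loss(model)
+#   # loss.generator_loss and loss.discriminator_loss are scalar Tensors.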
def _get_update_ops(kwargs, gen_scope, dis_scope, check_for_unused_ops=True):
"""Gets generator and discriminator update ops.
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index 93a12af944..58f348034f 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from absl.testing import parameterized
import numpy as np
from tensorflow.contrib import layers
@@ -113,6 +114,12 @@ def stargan_generator_model(inputs, _):
return variable_scope.get_variable('dummy_g', initializer=0.5) * inputs
+class StarGANGenerator(object):
+
+ def __call__(self, inputs, _):
+ return stargan_generator_model(inputs, _)
+
+
def stargan_discriminator_model(inputs, num_domains):
"""Differentiable dummy discriminator for StarGAN."""
@@ -129,6 +136,12 @@ def stargan_discriminator_model(inputs, num_domains):
return output_src, output_cls
+class StarGANDiscriminator(object):
+
+ def __call__(self, inputs, num_domains):
+ return stargan_discriminator_model(inputs, num_domains)
+
+
def get_gan_model():
# TODO(joelshor): Find a better way of creating a variable scope.
with variable_scope.variable_scope('generator') as gen_scope:
@@ -271,63 +284,75 @@ def create_callable_cyclegan_model():
data_y=array_ops.ones([1, 2]))
-def get_sync_optimizer():
- return sync_replicas_optimizer.SyncReplicasOptimizer(
- gradient_descent.GradientDescentOptimizer(learning_rate=1.0),
- replicas_to_aggregate=1)
+def get_stargan_model():
+ """Similar to get_gan_model()."""
+ # TODO(joelshor): Find a better way of creating a variable scope.
+ with variable_scope.variable_scope('generator') as gen_scope:
+ pass
+ with variable_scope.variable_scope('discriminator') as dis_scope:
+ pass
+ return namedtuples.StarGANModel(
+ input_data=array_ops.ones([1, 2, 2, 3]),
+ input_data_domain_label=array_ops.ones([1, 2]),
+ generated_data=array_ops.ones([1, 2, 2, 3]),
+ generated_data_domain_target=array_ops.ones([1, 2]),
+ reconstructed_data=array_ops.ones([1, 2, 2, 3]),
+ discriminator_input_data_source_predication=array_ops.ones([1]),
+ discriminator_generated_data_source_predication=array_ops.ones([1]),
+ discriminator_input_data_domain_predication=array_ops.ones([1, 2]),
+ discriminator_generated_data_domain_predication=array_ops.ones([1, 2]),
+ generator_variables=None,
+ generator_scope=gen_scope,
+ generator_fn=stargan_generator_model,
+ discriminator_variables=None,
+ discriminator_scope=dis_scope,
+ discriminator_fn=stargan_discriminator_model)
-def get_tensor_pool_fn(pool_size):
+def get_callable_stargan_model():
+ model = get_stargan_model()
+ return model._replace(
+ generator_fn=StarGANGenerator(), discriminator_fn=StarGANDiscriminator())
- def tensor_pool_fn_impl(input_values):
- return random_tensor_pool.tensor_pool(input_values, pool_size=pool_size)
- return tensor_pool_fn_impl
+def create_stargan_model():
+ return train.stargan_model(
+ stargan_generator_model, stargan_discriminator_model,
+ array_ops.ones([1, 2, 2, 3]), array_ops.ones([1, 2]))
-def get_tensor_pool_fn_for_infogan(pool_size):
+def create_callable_stargan_model():
+ return train.stargan_model(StarGANGenerator(), StarGANDiscriminator(),
+ array_ops.ones([1, 2, 2, 3]),
+ array_ops.ones([1, 2]))
- def tensor_pool_fn_impl(input_values):
- generated_data, generator_inputs = input_values
- output_values = random_tensor_pool.tensor_pool(
- [generated_data] + generator_inputs, pool_size=pool_size)
- return output_values[0], output_values[1:]
- return tensor_pool_fn_impl
+def get_sync_optimizer():
+ return sync_replicas_optimizer.SyncReplicasOptimizer(
+ gradient_descent.GradientDescentOptimizer(learning_rate=1.0),
+ replicas_to_aggregate=1)
-class GANModelTest(test.TestCase):
+class GANModelTest(test.TestCase, parameterized.TestCase):
"""Tests for `gan_model`."""
- def _test_output_type_helper(self, create_fn, tuple_type):
- self.assertTrue(isinstance(create_fn(), tuple_type))
-
- def test_output_type_gan(self):
- self._test_output_type_helper(get_gan_model, namedtuples.GANModel)
-
- def test_output_type_callable_gan(self):
- self._test_output_type_helper(get_callable_gan_model, namedtuples.GANModel)
-
- def test_output_type_infogan(self):
- self._test_output_type_helper(get_infogan_model, namedtuples.InfoGANModel)
-
- def test_output_type_callable_infogan(self):
- self._test_output_type_helper(get_callable_infogan_model,
- namedtuples.InfoGANModel)
-
- def test_output_type_acgan(self):
- self._test_output_type_helper(get_acgan_model, namedtuples.ACGANModel)
-
- def test_output_type_callable_acgan(self):
- self._test_output_type_helper(get_callable_acgan_model,
- namedtuples.ACGANModel)
-
- def test_output_type_cyclegan(self):
- self._test_output_type_helper(get_cyclegan_model, namedtuples.CycleGANModel)
-
- def test_output_type_callable_cyclegan(self):
- self._test_output_type_helper(get_callable_cyclegan_model,
- namedtuples.CycleGANModel)
+ @parameterized.named_parameters(
+ ('gan', get_gan_model, namedtuples.GANModel),
+ ('callable_gan', get_callable_gan_model, namedtuples.GANModel),
+ ('infogan', get_infogan_model, namedtuples.InfoGANModel),
+ ('callable_infogan', get_callable_infogan_model,
+ namedtuples.InfoGANModel),
+ ('acgan', get_acgan_model, namedtuples.ACGANModel),
+ ('callable_acgan', get_callable_acgan_model, namedtuples.ACGANModel),
+ ('cyclegan', get_cyclegan_model, namedtuples.CycleGANModel),
+ ('callable_cyclegan', get_callable_cyclegan_model,
+ namedtuples.CycleGANModel),
+ ('stargan', get_stargan_model, namedtuples.StarGANModel),
+ ('callable_stargan', get_callable_stargan_model, namedtuples.StarGANModel)
+ )
+ def test_output_type(self, create_fn, expected_tuple_type):
+ """Test that output type is as expected."""
+ self.assertIsInstance(create_fn(), expected_tuple_type)
def test_no_shape_check(self):
@@ -357,7 +382,6 @@ class StarGANModelTest(test.TestCase):
@staticmethod
def create_input_and_label_tensor(batch_size, img_size, c_size, num_domains):
-
input_tensor_list = []
label_tensor_list = []
for _ in range(num_domains):
@@ -369,7 +393,6 @@ class StarGANModelTest(test.TestCase):
return input_tensor_list, label_tensor_list
def test_generate_stargan_random_domain_target(self):
-
batch_size = 8
domain_numbers = 3
@@ -384,7 +407,6 @@ class StarGANModelTest(test.TestCase):
self.assertEqual(1, np.max(target))
def test_stargan_model_output_type(self):
-
batch_size = 2
img_size = 16
c_size = 3
@@ -408,7 +430,6 @@ class StarGANModelTest(test.TestCase):
self.assertTrue(callable(model.generator_fn))
def test_stargan_model_generator_output(self):
-
batch_size = 2
img_size = 16
c_size = 3
@@ -439,7 +460,6 @@ class StarGANModelTest(test.TestCase):
reconstructed_data.shape)
def test_stargan_model_discriminator_output(self):
-
batch_size = 2
img_size = 16
c_size = 3
@@ -484,53 +504,55 @@ class StarGANModelTest(test.TestCase):
disc_gen_label.shape)
-class GANLossTest(test.TestCase):
+class GANLossTest(test.TestCase, parameterized.TestCase):
"""Tests for `gan_loss`."""
- # Test output type.
- def _test_output_type_helper(self, get_gan_model_fn):
+ @parameterized.named_parameters(
+ ('gan', get_gan_model),
+ ('callable_gan', get_callable_gan_model),
+ ('infogan', get_infogan_model),
+ ('callable_infogan', get_callable_infogan_model),
+ ('acgan', get_acgan_model),
+ ('callable_acgan', get_callable_acgan_model),
+ )
+ def test_output_type(self, get_gan_model_fn):
+ """Test output type."""
loss = train.gan_loss(get_gan_model_fn(), add_summaries=True)
- self.assertTrue(isinstance(loss, namedtuples.GANLoss))
+ self.assertIsInstance(loss, namedtuples.GANLoss)
self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
- def test_output_type_gan(self):
- self._test_output_type_helper(get_gan_model)
-
- def test_output_type_callable_gan(self):
- self._test_output_type_helper(get_callable_gan_model)
-
- def test_output_type_infogan(self):
- self._test_output_type_helper(get_infogan_model)
-
- def test_output_type_callable_infogan(self):
- self._test_output_type_helper(get_callable_infogan_model)
-
- def test_output_type_acgan(self):
- self._test_output_type_helper(get_acgan_model)
-
- def test_output_type_callable_acgan(self):
- self._test_output_type_helper(get_callable_acgan_model)
-
- def test_output_type_cyclegan(self):
- loss = train.cyclegan_loss(create_cyclegan_model(), add_summaries=True)
- self.assertIsInstance(loss, namedtuples.CycleGANLoss)
- self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
-
- def test_output_type_callable_cyclegan(self):
- loss = train.cyclegan_loss(
- create_callable_cyclegan_model(), add_summaries=True)
+ @parameterized.named_parameters(
+ ('cyclegan', create_cyclegan_model),
+ ('callable_cyclegan', create_callable_cyclegan_model),
+ )
+ def test_cyclegan_output_type(self, get_gan_model_fn):
+ loss = train.cyclegan_loss(get_gan_model_fn(), add_summaries=True)
self.assertIsInstance(loss, namedtuples.CycleGANLoss)
self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
- # Test gradient penalty option.
- def _test_grad_penalty_helper(self, create_gan_model_fn, one_sided=False):
+ @parameterized.named_parameters(
+ ('gan', create_gan_model, False),
+ ('gan_one_sided', create_gan_model, True),
+ ('callable_gan', create_callable_gan_model, False),
+ ('callable_gan_one_sided', create_callable_gan_model, True),
+ ('infogan', create_infogan_model, False),
+ ('infogan_one_sided', create_infogan_model, True),
+ ('callable_infogan', create_callable_infogan_model, False),
+ ('callable_infogan_one_sided', create_callable_infogan_model, True),
+ ('acgan', create_acgan_model, False),
+ ('acgan_one_sided', create_acgan_model, True),
+ ('callable_acgan', create_callable_acgan_model, False),
+ ('callable_acgan_one_sided', create_callable_acgan_model, True),
+ )
+ def test_grad_penalty(self, create_gan_model_fn, one_sided):
+ """Test gradient penalty option."""
model = create_gan_model_fn()
loss = train.gan_loss(model)
loss_gp = train.gan_loss(
model,
gradient_penalty_weight=1.0,
gradient_penalty_one_sided=one_sided)
- self.assertTrue(isinstance(loss_gp, namedtuples.GANLoss))
+ self.assertIsInstance(loss_gp, namedtuples.GANLoss)
# Check values.
with self.test_session(use_gpu=True) as sess:
@@ -541,59 +563,28 @@ class GANLossTest(test.TestCase):
[loss.discriminator_loss, loss_gp.discriminator_loss])
self.assertEqual(loss_gen_np, loss_gen_gp_np)
- self.assertTrue(loss_dis_np < loss_dis_gp_np)
-
- def test_grad_penalty_gan(self):
- self._test_grad_penalty_helper(create_gan_model)
-
- def test_grad_penalty_callable_gan(self):
- self._test_grad_penalty_helper(create_callable_gan_model)
-
- def test_grad_penalty_infogan(self):
- self._test_grad_penalty_helper(create_infogan_model)
-
- def test_grad_penalty_callable_infogan(self):
- self._test_grad_penalty_helper(create_callable_infogan_model)
-
- def test_grad_penalty_acgan(self):
- self._test_grad_penalty_helper(create_acgan_model)
-
- def test_grad_penalty_callable_acgan(self):
- self._test_grad_penalty_helper(create_callable_acgan_model)
-
- def test_grad_penalty_one_sided_gan(self):
- self._test_grad_penalty_helper(create_gan_model, one_sided=True)
-
- def test_grad_penalty_one_sided_callable_gan(self):
- self._test_grad_penalty_helper(create_callable_gan_model, one_sided=True)
-
- def test_grad_penalty_one_sided_infogan(self):
- self._test_grad_penalty_helper(create_infogan_model, one_sided=True)
-
- def test_grad_penalty_one_sided_callable_infogan(self):
- self._test_grad_penalty_helper(
- create_callable_infogan_model, one_sided=True)
-
- def test_grad_penalty_one_sided_acgan(self):
- self._test_grad_penalty_helper(create_acgan_model, one_sided=True)
-
- def test_grad_penalty_one_sided_callable_acgan(self):
- self._test_grad_penalty_helper(create_callable_acgan_model, one_sided=True)
-
- # Test mutual information penalty option.
- def _test_mutual_info_penalty_helper(self, create_gan_model_fn):
+ self.assertLess(loss_dis_np, loss_dis_gp_np)
+
+ @parameterized.named_parameters(
+ ('infogan', get_infogan_model),
+ ('callable_infogan', get_callable_infogan_model),
+ )
+ def test_mutual_info_penalty(self, create_gan_model_fn):
+ """Test mutual information penalty option."""
train.gan_loss(
create_gan_model_fn(),
mutual_information_penalty_weight=constant_op.constant(1.0))
- def test_mutual_info_penalty_infogan(self):
- self._test_mutual_info_penalty_helper(get_infogan_model)
-
- def test_mutual_info_penalty_callable_infogan(self):
- self._test_mutual_info_penalty_helper(get_callable_infogan_model)
-
- # Test regularization loss.
- def _test_regularization_helper(self, get_gan_model_fn):
+ @parameterized.named_parameters(
+ ('gan', get_gan_model),
+ ('callable_gan', get_callable_gan_model),
+ ('infogan', get_infogan_model),
+ ('callable_infogan', get_callable_infogan_model),
+ ('acgan', get_acgan_model),
+ ('callable_acgan', get_callable_acgan_model),
+ )
+ def test_regularization_helper(self, get_gan_model_fn):
+ """Test regularization loss."""
# Evaluate losses without regularization.
no_reg_loss = train.gan_loss(get_gan_model_fn())
with self.test_session(use_gpu=True):
@@ -616,33 +607,19 @@ class GANLossTest(test.TestCase):
self.assertEqual(3.0, reg_loss_gen_np - no_reg_loss_gen_np)
self.assertEqual(2.0, reg_loss_dis_np - no_reg_loss_dis_np)
- def test_regularization_gan(self):
- self._test_regularization_helper(get_gan_model)
-
- def test_regularization_callable_gan(self):
- self._test_regularization_helper(get_callable_gan_model)
-
- def test_regularization_infogan(self):
- self._test_regularization_helper(get_infogan_model)
-
- def test_regularization_callable_infogan(self):
- self._test_regularization_helper(get_callable_infogan_model)
-
- def test_regularization_acgan(self):
- self._test_regularization_helper(get_acgan_model)
-
- def test_regularization_callable_acgan(self):
- self._test_regularization_helper(get_callable_acgan_model)
-
- # Test that ACGan models work.
- def _test_acgan_helper(self, create_gan_model_fn):
+ @parameterized.named_parameters(
+ ('notcallable', create_acgan_model),
+ ('callable', create_callable_acgan_model),
+ )
+ def test_acgan(self, create_gan_model_fn):
+ """Test that ACGAN models work."""
model = create_gan_model_fn()
loss = train.gan_loss(model)
loss_ac_gen = train.gan_loss(model, aux_cond_generator_weight=1.0)
loss_ac_dis = train.gan_loss(model, aux_cond_discriminator_weight=1.0)
- self.assertTrue(isinstance(loss, namedtuples.GANLoss))
- self.assertTrue(isinstance(loss_ac_gen, namedtuples.GANLoss))
- self.assertTrue(isinstance(loss_ac_dis, namedtuples.GANLoss))
+ self.assertIsInstance(loss, namedtuples.GANLoss)
+ self.assertIsInstance(loss_ac_gen, namedtuples.GANLoss)
+ self.assertIsInstance(loss_ac_dis, namedtuples.GANLoss)
# Check values.
with self.test_session(use_gpu=True) as sess:
@@ -656,20 +633,18 @@ class GANLossTest(test.TestCase):
loss_ac_dis.discriminator_loss
])
- self.assertTrue(loss_gen_np < loss_dis_np)
+ self.assertLess(loss_gen_np, loss_dis_np)
self.assertTrue(np.isscalar(loss_ac_gen_gen_np))
self.assertTrue(np.isscalar(loss_ac_dis_gen_np))
self.assertTrue(np.isscalar(loss_ac_gen_dis_np))
self.assertTrue(np.isscalar(loss_ac_dis_dis_np))
- def test_acgan(self):
- self._test_acgan_helper(create_acgan_model)
-
- def test_callable_acgan(self):
- self._test_acgan_helper(create_callable_acgan_model)
-
- # Test that CycleGan models work.
- def _test_cyclegan_helper(self, create_gan_model_fn):
+ @parameterized.named_parameters(
+ ('notcallable', create_cyclegan_model),
+ ('callable', create_callable_cyclegan_model),
+ )
+ def test_cyclegan(self, create_gan_model_fn):
+ """Test that CycleGan models work."""
model = create_gan_model_fn()
loss = train.cyclegan_loss(model)
self.assertIsInstance(loss, namedtuples.CycleGANLoss)
@@ -690,14 +665,86 @@ class GANLossTest(test.TestCase):
self.assertTrue(np.isscalar(loss_y2x_gen_np))
self.assertTrue(np.isscalar(loss_y2x_dis_np))
- def test_cyclegan(self):
- self._test_cyclegan_helper(create_cyclegan_model)
+ @parameterized.named_parameters(
+ ('notcallable', create_stargan_model),
+ ('callable', create_callable_stargan_model),
+ )
+ def test_stargan(self, create_gan_model_fn):
- def test_callable_cyclegan(self):
- self._test_cyclegan_helper(create_callable_cyclegan_model)
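+ """Test that StarGAN models work."""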
+ model = create_gan_model_fn()
+ model_loss = train.stargan_loss(model)
+
+ self.assertIsInstance(model_loss, namedtuples.GANLoss)
+
+ with self.test_session() as sess:
+
+ sess.run(variables.global_variables_initializer())
+
+ gen_loss, disc_loss = sess.run(
+ [model_loss.generator_loss, model_loss.discriminator_loss])
+
+ self.assertTrue(np.isscalar(gen_loss))
+ self.assertTrue(np.isscalar(disc_loss))
+
+ @parameterized.named_parameters(
+ ('gan', create_gan_model),
+ ('callable_gan', create_callable_gan_model),
+ ('infogan', create_infogan_model),
+ ('callable_infogan', create_callable_infogan_model),
+ ('acgan', create_acgan_model),
+ ('callable_acgan', create_callable_acgan_model),
+ )
+ def test_tensor_pool(self, create_gan_model_fn):
+ """Test tensor pool option."""
+ model = create_gan_model_fn()
+ tensor_pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=5)
+ loss = train.gan_loss(model, tensor_pool_fn=tensor_pool_fn)
+ self.assertIsInstance(loss, namedtuples.GANLoss)
+
+ # Check values.
+ with self.test_session(use_gpu=True) as sess:
+ variables.global_variables_initializer().run()
+ for _ in range(10):
+ sess.run([loss.generator_loss, loss.discriminator_loss])
+
+ def test_discriminator_only_sees_pool(self):
+ """Checks that discriminator only sees pooled values."""
+ def checker_gen_fn(_):
+ return constant_op.constant(0.0)
+ model = train.gan_model(
+ checker_gen_fn,
+ discriminator_model,
+ real_data=array_ops.zeros([]),
+ generator_inputs=random_ops.random_normal([]))
+ def tensor_pool_fn(_):
+ return (random_ops.random_uniform([]), random_ops.random_uniform([]))
+ def checker_dis_fn(inputs, _):
+ """Discriminator that checks that it only sees pooled Tensors."""
+ self.assertFalse(constant_op.is_constant(inputs))
+ return inputs
+ model = model._replace(
+ discriminator_fn=checker_dis_fn)
+ train.gan_loss(model, tensor_pool_fn=tensor_pool_fn)
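+ # If `gan_loss` fed the raw (constant) generator output to the
+ # discriminator, `checker_dis_fn` would see a constant tensor and the
+ # assertion above would fail.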
+
+ def test_doesnt_crash_when_in_nested_scope(self):
+ with variable_scope.variable_scope('outer_scope'):
+ gan_model = train.gan_model(
+ generator_model,
+ discriminator_model,
+ real_data=array_ops.zeros([1, 2]),
+ generator_inputs=random_ops.random_normal([1, 2]))
+
+ # This should work inside a scope.
+ train.gan_loss(gan_model, gradient_penalty_weight=1.0)
+
+ # This should also work outside a scope.
+ train.gan_loss(gan_model, gradient_penalty_weight=1.0)
- def _check_tensor_pool_adjusted_model_outputs(self, tensor1, tensor2,
- pool_size):
+
+class TensorPoolAdjustedModelTest(test.TestCase):
+
+ def _check_tensor_pool_adjusted_model_outputs(
+ self, tensor1, tensor2, pool_size):
history_values = []
with self.test_session(use_gpu=True) as sess:
variables.global_variables_initializer().run()
@@ -714,115 +761,66 @@ class GANLossTest(test.TestCase):
# pool).
self.assertTrue(any([(v == t2).all() for v in history_values]))
- # Test `_tensor_pool_adjusted_model` for gan model.
- def test_tensor_pool_adjusted_model_gan(self):
- model = create_gan_model()
-
- new_model = train._tensor_pool_adjusted_model(model, None)
+ def _make_new_model_and_check(self, model, pool_size):
+ pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=pool_size)
+ new_model = train._tensor_pool_adjusted_model(model, pool_fn)
# 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0'
self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES)))
- self.assertIs(new_model.discriminator_gen_outputs,
- model.discriminator_gen_outputs)
-
- pool_size = 5
- new_model = train._tensor_pool_adjusted_model(
- model, get_tensor_pool_fn(pool_size=pool_size))
self.assertIsNot(new_model.discriminator_gen_outputs,
model.discriminator_gen_outputs)
+
+ return new_model
+
+ def test_tensor_pool_adjusted_model_gan(self):
+ """Test `_tensor_pool_adjusted_model` for gan model."""
+ pool_size = 5
+ model = create_gan_model()
+ new_model = self._make_new_model_and_check(model, pool_size)
+
# Check values.
self._check_tensor_pool_adjusted_model_outputs(
model.discriminator_gen_outputs, new_model.discriminator_gen_outputs,
pool_size)
- # Test _tensor_pool_adjusted_model for infogan model.
def test_tensor_pool_adjusted_model_infogan(self):
+ """Test _tensor_pool_adjusted_model for infogan model."""
+ pool_size = 5
model = create_infogan_model()
+ new_model = self._make_new_model_and_check(model, pool_size)
- pool_size = 5
- new_model = train._tensor_pool_adjusted_model(
- model, get_tensor_pool_fn_for_infogan(pool_size=pool_size))
- # 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0'
- self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES)))
- self.assertIsNot(new_model.discriminator_gen_outputs,
- model.discriminator_gen_outputs)
+ # Check values.
self.assertIsNot(new_model.predicted_distributions,
model.predicted_distributions)
- # Check values.
self._check_tensor_pool_adjusted_model_outputs(
model.discriminator_gen_outputs, new_model.discriminator_gen_outputs,
pool_size)
- # Test _tensor_pool_adjusted_model for acgan model.
def test_tensor_pool_adjusted_model_acgan(self):
+ """Test _tensor_pool_adjusted_model for acgan model."""
+ pool_size = 5
model = create_acgan_model()
+ new_model = self._make_new_model_and_check(model, pool_size)
- pool_size = 5
- new_model = train._tensor_pool_adjusted_model(
- model, get_tensor_pool_fn(pool_size=pool_size))
- # 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0'
- self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES)))
- self.assertIsNot(new_model.discriminator_gen_outputs,
- model.discriminator_gen_outputs)
+ # Check values.
self.assertIsNot(new_model.discriminator_gen_classification_logits,
model.discriminator_gen_classification_logits)
- # Check values.
self._check_tensor_pool_adjusted_model_outputs(
model.discriminator_gen_outputs, new_model.discriminator_gen_outputs,
pool_size)
- # Test tensor pool.
- def _test_tensor_pool_helper(self, create_gan_model_fn):
- model = create_gan_model_fn()
- if isinstance(model, namedtuples.InfoGANModel):
- tensor_pool_fn = get_tensor_pool_fn_for_infogan(pool_size=5)
- else:
- tensor_pool_fn = get_tensor_pool_fn(pool_size=5)
- loss = train.gan_loss(model, tensor_pool_fn=tensor_pool_fn)
- self.assertTrue(isinstance(loss, namedtuples.GANLoss))
-
- # Check values.
- with self.test_session(use_gpu=True) as sess:
- variables.global_variables_initializer().run()
- for _ in range(10):
- sess.run([loss.generator_loss, loss.discriminator_loss])
-
- def test_tensor_pool_gan(self):
- self._test_tensor_pool_helper(create_gan_model)
-
- def test_tensor_pool_callable_gan(self):
- self._test_tensor_pool_helper(create_callable_gan_model)
-
- def test_tensor_pool_infogan(self):
- self._test_tensor_pool_helper(create_infogan_model)
-
- def test_tensor_pool_callable_infogan(self):
- self._test_tensor_pool_helper(create_callable_infogan_model)
-
- def test_tensor_pool_acgan(self):
- self._test_tensor_pool_helper(create_acgan_model)
-
- def test_tensor_pool_callable_acgan(self):
- self._test_tensor_pool_helper(create_callable_acgan_model)
-
- def test_doesnt_crash_when_in_nested_scope(self):
- with variable_scope.variable_scope('outer_scope'):
- gan_model = train.gan_model(
- generator_model,
- discriminator_model,
- real_data=array_ops.zeros([1, 2]),
- generator_inputs=random_ops.random_normal([1, 2]))
-
- # This should work inside a scope.
- train.gan_loss(gan_model, gradient_penalty_weight=1.0)
- # This should also work outside a scope.
- train.gan_loss(gan_model, gradient_penalty_weight=1.0)
-
-
-class GANTrainOpsTest(test.TestCase):
+class GANTrainOpsTest(test.TestCase, parameterized.TestCase):
"""Tests for `gan_train_ops`."""
- def _test_output_type_helper(self, create_gan_model_fn):
+ @parameterized.named_parameters(
+ ('gan', create_gan_model),
+ ('callable_gan', create_callable_gan_model),
+ ('infogan', create_infogan_model),
+ ('callable_infogan', create_callable_infogan_model),
+ ('acgan', create_acgan_model),
+ ('callable_acgan', create_callable_acgan_model),
+ )
+ def test_output_type(self, create_gan_model_fn):
model = create_gan_model_fn()
loss = train.gan_loss(model)
@@ -836,28 +834,24 @@ class GANTrainOpsTest(test.TestCase):
summarize_gradients=True,
colocate_gradients_with_ops=True)
- self.assertTrue(isinstance(train_ops, namedtuples.GANTrainOps))
-
- def test_output_type_gan(self):
- self._test_output_type_helper(create_gan_model)
-
- def test_output_type_callable_gan(self):
- self._test_output_type_helper(create_callable_gan_model)
-
- def test_output_type_infogan(self):
- self._test_output_type_helper(create_infogan_model)
-
- def test_output_type_callable_infogan(self):
- self._test_output_type_helper(create_callable_infogan_model)
-
- def test_output_type_acgan(self):
- self._test_output_type_helper(create_acgan_model)
-
- def test_output_type_callable_acgan(self):
- self._test_output_type_helper(create_callable_acgan_model)
+ self.assertIsInstance(train_ops, namedtuples.GANTrainOps)
# TODO(joelshor): Add a test to check that custom update op is run.
- def _test_unused_update_ops(self, create_gan_model_fn, provide_update_ops):
+ @parameterized.named_parameters(
+ ('gan', create_gan_model, False),
+ ('gan_provideupdates', create_gan_model, True),
+ ('callable_gan', create_callable_gan_model, False),
+ ('callable_gan_provideupdates', create_callable_gan_model, True),
+ ('infogan', create_infogan_model, False),
+ ('infogan_provideupdates', create_infogan_model, True),
+ ('callable_infogan', create_callable_infogan_model, False),
+ ('callable_infogan_provideupdates', create_callable_infogan_model, True),
+ ('acgan', create_acgan_model, False),
+ ('acgan_provideupdates', create_acgan_model, True),
+ ('callable_acgan', create_callable_acgan_model, False),
+ ('callable_acgan_provideupdates', create_callable_acgan_model, True),
+ )
+ def test_unused_update_ops(self, create_gan_model_fn, provide_update_ops):
model = create_gan_model_fn()
loss = train.gan_loss(model)
@@ -904,45 +898,16 @@ class GANTrainOpsTest(test.TestCase):
self.assertEqual(1, gen_update_count.eval())
self.assertEqual(1, dis_update_count.eval())
- def test_unused_update_ops_gan(self):
- self._test_unused_update_ops(create_gan_model, False)
-
- def test_unused_update_ops_gan_provideupdates(self):
- self._test_unused_update_ops(create_gan_model, True)
-
- def test_unused_update_ops_callable_gan(self):
- self._test_unused_update_ops(create_callable_gan_model, False)
-
- def test_unused_update_ops_callable_gan_provideupdates(self):
- self._test_unused_update_ops(create_callable_gan_model, True)
-
- def test_unused_update_ops_infogan(self):
- self._test_unused_update_ops(create_infogan_model, False)
-
- def test_unused_update_ops_infogan_provideupdates(self):
- self._test_unused_update_ops(create_infogan_model, True)
-
- def test_unused_update_ops_callable_infogan(self):
- self._test_unused_update_ops(create_callable_infogan_model, False)
-
- def test_unused_update_ops_callable_infogan_provideupdates(self):
- self._test_unused_update_ops(create_callable_infogan_model, True)
-
- def test_unused_update_ops_acgan(self):
- self._test_unused_update_ops(create_acgan_model, False)
-
- def test_unused_update_ops_acgan_provideupdates(self):
- self._test_unused_update_ops(create_acgan_model, True)
-
- def test_unused_update_ops_callable_acgan(self):
- self._test_unused_update_ops(create_callable_acgan_model, False)
-
- def test_unused_update_ops_callable_acgan_provideupdates(self):
- self._test_unused_update_ops(create_callable_acgan_model, True)
-
- def _test_sync_replicas_helper(self,
- create_gan_model_fn,
- create_global_step=False):
+ @parameterized.named_parameters(
+ ('gan', create_gan_model, False),
+ ('callable_gan', create_callable_gan_model, False),
+ ('infogan', create_infogan_model, False),
+ ('callable_infogan', create_callable_infogan_model, False),
+ ('acgan', create_acgan_model, False),
+ ('callable_acgan', create_callable_acgan_model, False),
+ ('gan_canbeint32', create_gan_model, True),
+ )
+ def test_sync_replicas(self, create_gan_model_fn, create_global_step):
model = create_gan_model_fn()
loss = train.gan_loss(model)
num_trainable_vars = len(variables_lib.get_trainable_variables())
@@ -956,7 +921,7 @@ class GANTrainOpsTest(test.TestCase):
d_opt = get_sync_optimizer()
train_ops = train.gan_train_ops(
model, loss, generator_optimizer=g_opt, discriminator_optimizer=d_opt)
- self.assertTrue(isinstance(train_ops, namedtuples.GANTrainOps))
+ self.assertIsInstance(train_ops, namedtuples.GANTrainOps)
# No new trainable variables should have been added.
self.assertEqual(num_trainable_vars,
len(variables_lib.get_trainable_variables()))
@@ -994,29 +959,8 @@ class GANTrainOpsTest(test.TestCase):
coord.request_stop()
coord.join(g_threads + d_threads)
- def test_sync_replicas_gan(self):
- self._test_sync_replicas_helper(create_gan_model)
-
- def test_sync_replicas_callable_gan(self):
- self._test_sync_replicas_helper(create_callable_gan_model)
-
- def test_sync_replicas_infogan(self):
- self._test_sync_replicas_helper(create_infogan_model)
-
- def test_sync_replicas_callable_infogan(self):
- self._test_sync_replicas_helper(create_callable_infogan_model)
- def test_sync_replicas_acgan(self):
- self._test_sync_replicas_helper(create_acgan_model)
-
- def test_sync_replicas_callable_acgan(self):
- self._test_sync_replicas_helper(create_callable_acgan_model)
-
- def test_global_step_can_be_int32(self):
- self._test_sync_replicas_helper(create_gan_model, create_global_step=True)
-
-
-class GANTrainTest(test.TestCase):
+class GANTrainTest(test.TestCase, parameterized.TestCase):
"""Tests for `gan_train`."""
def _gan_train_ops(self, generator_add, discriminator_add):
@@ -1032,7 +976,15 @@ class GANTrainTest(test.TestCase):
global_step_inc_op=step.assign_add(1))
return train_ops
- def _test_run_helper(self, create_gan_model_fn):
+ @parameterized.named_parameters(
+ ('gan', create_gan_model),
+ ('callable_gan', create_callable_gan_model),
+ ('infogan', create_infogan_model),
+ ('callable_infogan', create_callable_infogan_model),
+ ('acgan', create_acgan_model),
+ ('callable_acgan', create_callable_acgan_model),
+ )
+ def test_run_helper(self, create_gan_model_fn):
random_seed.set_random_seed(1234)
model = create_gan_model_fn()
loss = train.gan_loss(model)
@@ -1048,26 +1000,12 @@ class GANTrainTest(test.TestCase):
self.assertTrue(np.isscalar(final_step))
self.assertEqual(2, final_step)
- def test_run_gan(self):
- self._test_run_helper(create_gan_model)
-
- def test_run_callable_gan(self):
- self._test_run_helper(create_callable_gan_model)
-
- def test_run_infogan(self):
- self._test_run_helper(create_infogan_model)
-
- def test_run_callable_infogan(self):
- self._test_run_helper(create_callable_infogan_model)
-
- def test_run_acgan(self):
- self._test_run_helper(create_acgan_model)
-
- def test_run_callable_acgan(self):
- self._test_run_helper(create_callable_acgan_model)
-
- # Test multiple train steps.
- def _test_multiple_steps_helper(self, get_hooks_fn_fn):
+ @parameterized.named_parameters(
+ ('seq_train_steps', train.get_sequential_train_hooks),
+ ('efficient_seq_train_steps', train.get_joint_train_hooks),
+ )
+ def test_multiple_steps(self, get_hooks_fn_fn):
+ """Test multiple train steps."""
train_ops = self._gan_train_ops(generator_add=10, discriminator_add=100)
train_steps = namedtuples.GANTrainSteps(
generator_train_steps=3, discriminator_train_steps=4)
@@ -1080,12 +1018,6 @@ class GANTrainTest(test.TestCase):
self.assertTrue(np.isscalar(final_step))
self.assertEqual(1 + 3 * 10 + 4 * 100, final_step)
- def test_multiple_steps_seq_train_steps(self):
- self._test_multiple_steps_helper(train.get_sequential_train_hooks)
-
- def test_multiple_steps_efficient_seq_train_steps(self):
- self._test_multiple_steps_helper(train.get_joint_train_hooks)
-
def test_supervisor_run_gan_model_train_ops_multiple_steps(self):
step = training_util.create_global_step()
train_ops = namedtuples.GANTrainOps(
@@ -1105,10 +1037,18 @@ class GANTrainTest(test.TestCase):
self.assertEqual(17.0, final_loss)
-class PatchGANTest(test.TestCase):
+class PatchGANTest(test.TestCase, parameterized.TestCase):
"""Tests that functions work on PatchGAN style output."""
- def _test_patchgan_helper(self, create_gan_model_fn):
+ @parameterized.named_parameters(
+ ('gan', create_gan_model),
+ ('callable_gan', create_callable_gan_model),
+ ('infogan', create_infogan_model),
+ ('callable_infogan', create_callable_infogan_model),
+ ('acgan', create_acgan_model),
+ ('callable_acgan', create_callable_acgan_model),
+ )
+ def test_patchgan(self, create_gan_model_fn):
"""Ensure that patch-based discriminators work end-to-end."""
random_seed.set_random_seed(1234)
model = create_gan_model_fn()
@@ -1125,24 +1065,6 @@ class PatchGANTest(test.TestCase):
self.assertTrue(np.isscalar(final_step))
self.assertEqual(2, final_step)
- def test_patchgan_gan(self):
- self._test_patchgan_helper(create_gan_model)
-
- def test_patchgan_callable_gan(self):
- self._test_patchgan_helper(create_callable_gan_model)
-
- def test_patchgan_infogan(self):
- self._test_patchgan_helper(create_infogan_model)
-
- def test_patchgan_callable_infogan(self):
- self._test_patchgan_helper(create_callable_infogan_model)
-
- def test_patchgan_acgan(self):
- self._test_patchgan_helper(create_acgan_model)
-
- def test_patchgan_callable_acgan(self):
- self._test_patchgan_helper(create_callable_acgan_model)
-
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 1435e19109..f3bbf6b4d7 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -33,10 +33,11 @@ limitations under the License.
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/pool_allocator.h"
+#include "tensorflow/core/common_runtime/process_state.h"
#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#include "tensorflow/core/common_runtime/process_state.h"
#endif // GOOGLE_CUDA
#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/lib/core/status.h"
@@ -182,28 +183,25 @@ class GdrMemoryManager : public RemoteMemoryManager {
TF_DISALLOW_COPY_AND_ASSIGN(GdrMemoryManager);
};
-// TODO(byronyi): remove this class duplicated from the one in
-// common/runtime/gpu/pool_allocator.h when it is available in common_runtime
-class BasicCPUAllocator : public SubAllocator {
- public:
- ~BasicCPUAllocator() override {}
-
- void* Alloc(size_t alignment, size_t num_bytes) override {
- return port::AlignedMalloc(num_bytes, alignment);
- }
- void Free(void* ptr, size_t) override { port::AlignedFree(ptr); }
-};
-
// TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator
+// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
+// longer in use.
class BFCRdmaAllocator : public BFCAllocator {
public:
BFCRdmaAllocator()
- : BFCAllocator(new BasicCPUAllocator(), 1LL << 36, true, "cpu_rdma_bfc") {
+ : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
+ true, "cpu_rdma_bfc") {}
+};
+class BFCRdmaAllocatorFactory : public AllocatorFactory {
+ public:
+ Allocator* CreateAllocator() override { return new BFCRdmaAllocator; }
+
+ SubAllocator* CreateSubAllocator(int numa_node) override {
+ return new BasicCPUAllocator(numa_node);
}
};
-REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocator);
+REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory);
GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
: host_(host),
@@ -276,8 +274,8 @@ Status GdrMemoryManager::Init() {
Allocator* allocators[] = {
#if GOOGLE_CUDA
GPUProcessState::singleton()->GetCUDAHostAllocator(0),
- ProcessState::singleton()->GetCPUAllocator(0),
#endif // GOOGLE_CUDA
+ ProcessState::singleton()->GetCPUAllocator(0),
cpu_allocator(),
};
diff --git a/tensorflow/contrib/graph_editor/reroute.py b/tensorflow/contrib/graph_editor/reroute.py
index 95c02a64d4..d42e0c01f4 100644
--- a/tensorflow/contrib/graph_editor/reroute.py
+++ b/tensorflow/contrib/graph_editor/reroute.py
@@ -208,9 +208,9 @@ def _reroute_ts(ts0, ts1, mode, can_modify=None, cannot_modify=None):
def swap_ts(ts0, ts1, can_modify=None, cannot_modify=None):
"""For each tensor's pair, swap the end of (t0,t1).
- B0 B1 B0 B1
- | | => X
- A0 A1 A0 A1
+ B0 B1 B0 B1
+ | | => X
+ A0 A1 A0 A1
Args:
ts0: an object convertible to a list of `tf.Tensor`.
@@ -233,9 +233,9 @@ def swap_ts(ts0, ts1, can_modify=None, cannot_modify=None):
def reroute_ts(ts0, ts1, can_modify=None, cannot_modify=None):
"""For each tensor's pair, replace the end of t1 by the end of t0.
- B0 B1 B0 B1
- | | => |/
- A0 A1 A0 A1
+ B0 B1 B0 B1
+ | | => |/
+ A0 A1 A0 A1
The end of the tensors in ts1 are left dangling.
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index beeabd6b65..dd602cf3a9 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1702,19 +1702,22 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
return utils.collect_named_outputs(output_collections, sc, flattened)
-def _model_variable_getter(getter,
- name,
- shape=None,
- dtype=None,
- initializer=None,
- regularizer=None,
- trainable=True,
- collections=None,
- caching_device=None,
- partitioner=None,
- rename=None,
- use_resource=None,
- **_):
+def _model_variable_getter(
+ getter,
+ name,
+ shape=None,
+ dtype=None,
+ initializer=None,
+ regularizer=None,
+ trainable=True,
+ collections=None,
+ caching_device=None,
+ partitioner=None,
+ rename=None,
+ use_resource=None,
+ synchronization=tf_variables.VariableSynchronization.AUTO,
+ aggregation=tf_variables.VariableAggregation.NONE,
+ **_):
"""Getter that uses model_variable for compatibility with core layers."""
short_name = name.split('/')[-1]
if rename and short_name in rename:
@@ -1732,7 +1735,9 @@ def _model_variable_getter(getter,
caching_device=caching_device,
partitioner=partitioner,
custom_getter=getter,
- use_resource=use_resource)
+ use_resource=use_resource,
+ synchronization=synchronization,
+ aggregation=aggregation)
def _build_variable_getter(rename=None):
diff --git a/tensorflow/contrib/lite/allocation.cc b/tensorflow/contrib/lite/allocation.cc
index c42622ff02..ef6c14f085 100644
--- a/tensorflow/contrib/lite/allocation.cc
+++ b/tensorflow/contrib/lite/allocation.cc
@@ -99,7 +99,9 @@ FileCopyAllocation::FileCopyAllocation(const char* filename,
filename);
return;
}
- copied_buffer_ = std::move(buffer);
+ // Versions of GCC before 6.2.0 don't support std::move from non-const
+ // char[] to const char[] unique_ptrs.
+ copied_buffer_.reset(const_cast<char const*>(buffer.release()));
}
FileCopyAllocation::~FileCopyAllocation() {}
diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/contrib/lite/arena_planner.cc
index 16a0e71624..02442575b3 100644
--- a/tensorflow/contrib/lite/arena_planner.cc
+++ b/tensorflow/contrib/lite/arena_planner.cc
@@ -17,14 +17,6 @@ limitations under the License.
namespace tflite {
-namespace {
-
-// Memory allocation tuning
-constexpr const int kDefaultArenaAlignment = 64;
-constexpr const int kDefaultTensorAlignment = 4;
-
-} // namespace
-
struct AllocationInfo {
// The node index requesting this allocation.
int node;
@@ -36,13 +28,16 @@ struct AllocationInfo {
ArenaPlanner::ArenaPlanner(TfLiteContext* context,
std::unique_ptr<GraphInfo> graph_info,
- bool preserve_inputs, bool preserve_intermediates)
+ bool preserve_inputs, bool preserve_intermediates,
+ int tensor_alignment)
: context_(context),
graph_info_(std::move(graph_info)),
arena_(kDefaultArenaAlignment),
persistent_arena_(kDefaultArenaAlignment),
preserve_inputs_(preserve_inputs),
- preserve_intermediates_(preserve_intermediates) {}
+ preserve_intermediates_(preserve_intermediates),
+ tensor_alignment_(tensor_alignment) {}
+
ArenaPlanner::~ArenaPlanner() {}
int64_t ArenaPlanner::BasePointer(TfLiteAllocationType type) {
@@ -264,14 +259,12 @@ TfLiteStatus ArenaPlanner::ResolveTensorAllocation(int tensor_index) {
TfLiteStatus ArenaPlanner::CalculateTensorAllocation(int tensor_index) {
TfLiteTensor& tensor = *graph_info_->tensor(tensor_index);
if (tensor.allocation_type == kTfLiteArenaRw) {
- TF_LITE_ENSURE_STATUS(arena_.Allocate(context_, kDefaultTensorAlignment,
- tensor.bytes,
- &allocs_[tensor_index]));
+ TF_LITE_ENSURE_STATUS(arena_.Allocate(
+ context_, tensor_alignment_, tensor.bytes, &allocs_[tensor_index]));
}
if (tensor.allocation_type == kTfLiteArenaRwPersistent) {
- TF_LITE_ENSURE_STATUS(
- persistent_arena_.Allocate(context_, kDefaultTensorAlignment,
- tensor.bytes, &allocs_[tensor_index]));
+ TF_LITE_ENSURE_STATUS(persistent_arena_.Allocate(
+ context_, tensor_alignment_, tensor.bytes, &allocs_[tensor_index]));
}
return kTfLiteOk;
}
diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h
index 82c866734f..55003cf4e9 100644
--- a/tensorflow/contrib/lite/arena_planner.h
+++ b/tensorflow/contrib/lite/arena_planner.h
@@ -25,6 +25,10 @@ limitations under the License.
namespace tflite {
+// Memory allocation tuning
+constexpr const int kDefaultArenaAlignment = 64;
+constexpr const int kDefaultTensorAlignment = 64;
+
struct AllocationInfo;
// A memory planner that makes all the allocations using arenas.
@@ -47,7 +51,8 @@ class ArenaPlanner : public MemoryPlanner {
// graph will not share memory with any other tensor, effectively preserving
// them until the end of inference.
ArenaPlanner(TfLiteContext* context, std::unique_ptr<GraphInfo> graph_info,
- bool preserve_inputs, bool preserve_intermediates);
+ bool preserve_inputs, bool preserve_intermediates,
+ int tensor_alignment = kDefaultTensorAlignment);
~ArenaPlanner() override;
ArenaPlanner(const ArenaPlanner&) = delete;
ArenaPlanner& operator=(const ArenaPlanner&) = delete;
@@ -112,6 +117,9 @@ class ArenaPlanner : public MemoryPlanner {
// If true, then no overlapping of memory areas is done, meaning intermediates
// results can be queried after running (modulo running delegates).
bool preserve_intermediates_;
+
+ // Number of bytes that tensor buffers should be aligned to.
+ int tensor_alignment_;
};
} // namespace tflite
diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc
index 1adb426d58..7d7c41289c 100644
--- a/tensorflow/contrib/lite/arena_planner_test.cc
+++ b/tensorflow/contrib/lite/arena_planner_test.cc
@@ -24,6 +24,8 @@ limitations under the License.
namespace tflite {
namespace {
+constexpr const int kTensorAlignment = 4;
+
// A simple op to be used in tests, as syntactic sugar.
class TestOp {
public:
@@ -156,7 +158,7 @@ class ArenaPlannerTest : public ::testing::Test {
context_.ReportError = ReportError;
planner_.reset(new ArenaPlanner(
&context_, std::unique_ptr<GraphInfo>(new TestGraphInfo(graph)),
- preserve_inputs, /*preserve intermediates*/ false));
+ preserve_inputs, /*preserve intermediates*/ false, kTensorAlignment));
CHECK(planner_->ResetAllocations() == kTfLiteOk);
CHECK(planner_->PlanAllocations() == kTfLiteOk);
}
@@ -178,8 +180,8 @@ class ArenaPlannerTest : public ::testing::Test {
const TfLiteTensor& tensor = (*graph_->tensors())[tensor_index];
int64_t offset = GetOffset(tensor_index) + tensor.bytes;
// We must make sure the offset is aligned to kTensorAlignment.
- if (offset % 4 != 0) {
- offset += 4 - offset % 4;
+ if (offset % kTensorAlignment != 0) {
+ offset += kTensorAlignment - offset % kTensorAlignment;
}
return offset;
};
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index bed862454e..efe8857344 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -1,193 +1,216 @@
"""Generate Flatbuffer binary from json."""
+
load(
"//tensorflow:tensorflow.bzl",
+ "tf_cc_shared_object",
"tf_cc_test",
)
def tflite_copts():
- """Defines compile time flags."""
- copts = [
- "-DFARMHASH_NO_CXX_STRING",
- ] + select({
- str(Label("//tensorflow:android_arm64")): [
- "-std=c++11",
- "-O3",
- ],
- str(Label("//tensorflow:android_arm")): [
- "-mfpu=neon",
- "-mfloat-abi=softfp",
- "-std=c++11",
- "-O3",
- ],
- str(Label("//tensorflow:android_x86")): [
- "-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK",
- ],
- str(Label("//tensorflow:ios_x86_64")): [
- "-msse4.1",
- ],
- "//conditions:default": [],
- }) + select({
- str(Label("//tensorflow:with_default_optimizations")): [],
- "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
- })
+ """Defines compile time flags."""
+ copts = [
+ "-DFARMHASH_NO_CXX_STRING",
+ ] + select({
+ str(Label("//tensorflow:android_arm64")): [
+ "-std=c++11",
+ "-O3",
+ ],
+ str(Label("//tensorflow:android_arm")): [
+ "-mfpu=neon",
+ "-mfloat-abi=softfp",
+ "-std=c++11",
+ "-O3",
+ ],
+ str(Label("//tensorflow:android_x86")): [
+ "-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK",
+ ],
+ str(Label("//tensorflow:ios_x86_64")): [
+ "-msse4.1",
+ ],
+ str(Label("//tensorflow:windows")): [
+ "/DTF_COMPILE_LIBRARY",
+ ],
+ "//conditions:default": [],
+ }) + select({
+ str(Label("//tensorflow:with_default_optimizations")): [],
+ "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
+ })
- return copts
+ return copts
LINKER_SCRIPT = "//tensorflow/contrib/lite/java/src/main/native:version_script.lds"
def tflite_linkopts_unstripped():
- """Defines linker flags to reduce size of TFLite binary.
+ """Defines linker flags to reduce size of TFLite binary.
- These are useful when trying to investigate the relative size of the
- symbols in TFLite.
+ These are useful when trying to investigate the relative size of the
+ symbols in TFLite.
- Returns:
- a select object with proper linkopts
- """
- return select({
- "//tensorflow:android": [
- "-Wl,--no-export-dynamic", # Only inc syms referenced by dynamic obj.
- "-Wl,--exclude-libs,ALL", # Exclude syms in all libs from auto export.
- "-Wl,--gc-sections", # Eliminate unused code and data.
- "-Wl,--as-needed", # Don't link unused libs.
- ],
- "//tensorflow/contrib/lite:mips": [],
- "//tensorflow/contrib/lite:mips64": [],
- "//conditions:default": [
- "-Wl,--icf=all", # Identical code folding.
- ],
- })
+ Returns:
+ a select object with proper linkopts
+ """
+ return select({
+ "//tensorflow:android": [
+ "-Wl,--no-export-dynamic", # Only inc syms referenced by dynamic obj.
+ "-Wl,--exclude-libs,ALL", # Exclude syms in all libs from auto export.
+ "-Wl,--gc-sections", # Eliminate unused code and data.
+ "-Wl,--as-needed", # Don't link unused libs.
+ ],
+ "//tensorflow/contrib/lite:mips": [],
+ "//tensorflow/contrib/lite:mips64": [],
+ "//conditions:default": [
+ "-Wl,--icf=all", # Identical code folding.
+ ],
+ })
def tflite_jni_linkopts_unstripped():
- """Defines linker flags to reduce size of TFLite binary with JNI.
+ """Defines linker flags to reduce size of TFLite binary with JNI.
- These are useful when trying to investigate the relative size of the
- symbols in TFLite.
+ These are useful when trying to investigate the relative size of the
+ symbols in TFLite.
- Returns:
- a select object with proper linkopts
- """
- return select({
- "//tensorflow:android": [
- "-Wl,--gc-sections", # Eliminate unused code and data.
- "-Wl,--as-needed", # Don't link unused libs.
- ],
- "//tensorflow/contrib/lite:mips": [],
- "//tensorflow/contrib/lite:mips64": [],
- "//conditions:default": [
- "-Wl,--icf=all", # Identical code folding.
- ],
- })
+ Returns:
+ a select object with proper linkopts
+ """
+ return select({
+ "//tensorflow:android": [
+ "-Wl,--gc-sections", # Eliminate unused code and data.
+ "-Wl,--as-needed", # Don't link unused libs.
+ ],
+ "//tensorflow/contrib/lite:mips": [],
+ "//tensorflow/contrib/lite:mips64": [],
+ "//conditions:default": [
+ "-Wl,--icf=all", # Identical code folding.
+ ],
+ })
def tflite_linkopts():
- """Defines linker flags to reduce size of TFLite binary."""
- return tflite_linkopts_unstripped() + select({
- "//tensorflow:android": [
- "-s", # Omit symbol table.
- ],
- "//conditions:default": [],
- })
+ """Defines linker flags to reduce size of TFLite binary."""
+ return tflite_linkopts_unstripped() + select({
+ "//tensorflow:android": [
+ "-s", # Omit symbol table.
+ ],
+ "//conditions:default": [],
+ })
def tflite_jni_linkopts():
- """Defines linker flags to reduce size of TFLite binary with JNI."""
- return tflite_jni_linkopts_unstripped() + select({
- "//tensorflow:android": [
- "-s", # Omit symbol table.
- "-latomic", # Required for some uses of ISO C++11 <atomic> in x86.
- ],
- "//conditions:default": [],
- })
+ """Defines linker flags to reduce size of TFLite binary with JNI."""
+ return tflite_jni_linkopts_unstripped() + select({
+ "//tensorflow:android": [
+ "-s", # Omit symbol table.
+ "-latomic", # Required for some uses of ISO C++11 <atomic> in x86.
+ ],
+ "//conditions:default": [],
+ })
+
+def tflite_jni_binary(
+ name,
+ copts = tflite_copts(),
+ linkopts = tflite_jni_linkopts(),
+ linkscript = LINKER_SCRIPT,
+ linkshared = 1,
+ linkstatic = 1,
+ deps = []):
+ """Builds a jni binary for TFLite."""
+ linkopts = linkopts + [
+ "-Wl,--version-script", # Export only jni functions & classes.
+ "$(location {})".format(linkscript),
+ ]
+ native.cc_binary(
+ name = name,
+ copts = copts,
+ linkshared = linkshared,
+ linkstatic = linkstatic,
+ deps = deps + [linkscript],
+ linkopts = linkopts,
+ )
-def tflite_jni_binary(name,
- copts=tflite_copts(),
- linkopts=tflite_jni_linkopts(),
- linkscript=LINKER_SCRIPT,
- linkshared=1,
- linkstatic=1,
- deps=[]):
- """Builds a jni binary for TFLite."""
- linkopts = linkopts + [
- "-Wl,--version-script", # Export only jni functions & classes.
- "$(location {})".format(linkscript),
- ]
- native.cc_binary(
- name=name,
- copts=copts,
- linkshared=linkshared,
- linkstatic=linkstatic,
- deps= deps + [linkscript],
- linkopts=linkopts)
+def tflite_cc_shared_object(
+ name,
+ copts = tflite_copts(),
+ linkopts = [],
+ linkstatic = 1,
+ deps = []):
+ """Builds a shared object for TFLite."""
+ tf_cc_shared_object(
+ name = name,
+ copts = copts,
+ linkstatic = linkstatic,
+ linkopts = linkopts + tflite_jni_linkopts(),
+ framework_so = [],
+ deps = deps,
+ )
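For orientation, a minimal sketch of how the two macros above might be consumed from a BUILD file. The load path assumes this file is //tensorflow/contrib/lite:build_def.bzl, and all target and dependency names are illustrative, not taken from this change:

load(
    "//tensorflow/contrib/lite:build_def.bzl",
    "tflite_cc_shared_object",
    "tflite_jni_binary",
)

tflite_jni_binary(
    name = "libtensorflowlite_jni.so",  # Stripped, version-scripted JNI library.
    deps = ["//tensorflow/contrib/lite/java/src/main/native:native_framework_only"],
)

tflite_cc_shared_object(
    name = "libtensorflowlite.so",  # Shared object built without framework_so.
    deps = ["//tensorflow/contrib/lite:framework"],
)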
def tf_to_tflite(name, src, options, out):
- """Convert a frozen tensorflow graphdef to TF Lite's flatbuffer.
+ """Convert a frozen tensorflow graphdef to TF Lite's flatbuffer.
- Args:
- name: Name of rule.
- src: name of the input graphdef file.
- options: options passed to TOCO.
- out: name of the output flatbuffer file.
- """
+ Args:
+ name: Name of rule.
+ src: name of the input graphdef file.
+ options: options passed to TOCO.
+ out: name of the output flatbuffer file.
+ """
- toco_cmdline = " ".join([
- "//tensorflow/contrib/lite/toco:toco",
- "--input_format=TENSORFLOW_GRAPHDEF",
- "--output_format=TFLITE",
- ("--input_file=$(location %s)" % src),
- ("--output_file=$(location %s)" % out),
- ] + options )
- native.genrule(
- name = name,
- srcs=[src],
- outs=[out],
- cmd = toco_cmdline,
- tools= ["//tensorflow/contrib/lite/toco:toco"],
- )
+ toco_cmdline = " ".join([
+ "//tensorflow/contrib/lite/toco:toco",
+ "--input_format=TENSORFLOW_GRAPHDEF",
+ "--output_format=TFLITE",
+ ("--input_file=$(location %s)" % src),
+ ("--output_file=$(location %s)" % out),
+ ] + options)
+ native.genrule(
+ name = name,
+ srcs = [src],
+ outs = [out],
+ cmd = toco_cmdline,
+ tools = ["//tensorflow/contrib/lite/toco:toco"],
+ )
def tflite_to_json(name, src, out):
- """Convert a TF Lite flatbuffer to JSON.
+ """Convert a TF Lite flatbuffer to JSON.
- Args:
- name: Name of rule.
- src: name of the input flatbuffer file.
- out: name of the output JSON file.
- """
+ Args:
+ name: Name of rule.
+ src: name of the input flatbuffer file.
+ out: name of the output JSON file.
+ """
- flatc = "@flatbuffers//:flatc"
- schema = "//tensorflow/contrib/lite/schema:schema.fbs"
- native.genrule(
- name = name,
- srcs = [schema, src],
- outs = [out],
- cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.bin &&" +
- "$(location %s) --raw-binary --strict-json -t" +
- " -o /tmp $(location %s) -- $${TMP}.bin &&" +
- "cp $${TMP}.json $(location %s)")
- % (src, flatc, schema, out),
- tools = [flatc],
- )
+ flatc = "@flatbuffers//:flatc"
+ schema = "//tensorflow/contrib/lite/schema:schema.fbs"
+ native.genrule(
+ name = name,
+ srcs = [schema, src],
+ outs = [out],
+ cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.bin &&" +
+ "$(location %s) --raw-binary --strict-json -t" +
+ " -o /tmp $(location %s) -- $${TMP}.bin &&" +
+ "cp $${TMP}.json $(location %s)") %
+ (src, flatc, schema, out),
+ tools = [flatc],
+ )
def json_to_tflite(name, src, out):
- """Convert a JSON file to TF Lite's flatbuffer.
+ """Convert a JSON file to TF Lite's flatbuffer.
- Args:
- name: Name of rule.
- src: name of the input JSON file.
- out: name of the output flatbuffer file.
- """
+ Args:
+ name: Name of rule.
+ src: name of the input JSON file.
+ out: name of the output flatbuffer file.
+ """
- flatc = "@flatbuffers//:flatc"
- schema = "//tensorflow/contrib/lite/schema:schema_fbs"
- native.genrule(
- name = name,
- srcs = [schema, src],
- outs = [out],
- cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.json &&" +
- "$(location %s) --raw-binary --unknown-json --allow-non-utf8 -b" +
- " -o /tmp $(location %s) $${TMP}.json &&" +
- "cp $${TMP}.bin $(location %s)")
- % (src, flatc, schema, out),
- tools = [flatc],
- )
+ flatc = "@flatbuffers//:flatc"
+ schema = "//tensorflow/contrib/lite/schema:schema_fbs"
+ native.genrule(
+ name = name,
+ srcs = [schema, src],
+ outs = [out],
+ cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.json &&" +
+ "$(location %s) --raw-binary --unknown-json --allow-non-utf8 -b" +
+ " -o /tmp $(location %s) $${TMP}.json &&" +
+ "cp $${TMP}.bin $(location %s)") %
+ (src, flatc, schema, out),
+ tools = [flatc],
+ )
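Taken together, the three conversion macros above support a graphdef-to-flatbuffer-to-JSON-to-flatbuffer round trip. A sketch with illustrative file names (the TOCO flag shown is an assumption, not taken from this change):

tf_to_tflite(
    name = "convert_model",
    src = "frozen_model.pb",
    options = ["--inference_type=FLOAT"],  # Assumed TOCO flag, for illustration.
    out = "model.tflite",
)

tflite_to_json(
    name = "model_to_json",
    src = "model.tflite",
    out = "model.json",
)

json_to_tflite(
    name = "json_to_model",
    src = "model.json",
    out = "model_roundtrip.tflite",
)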
# This is the master list of generated examples that will be made into tests. A
# function called make_XXX_tests() must also appear in generate_examples.py.
@@ -222,6 +245,7 @@ def generated_test_models():
"local_response_norm",
"log_softmax",
"log",
+ "logical_or",
"lstm",
"max_pool",
"maximum",
@@ -230,12 +254,14 @@ def generated_test_models():
"mul",
"neg",
"not_equal",
+ "one_hot",
+ "pack",
"pad",
"padv2",
"prelu",
"pow",
"reduce_max",
- "reduce_prod",
+ #"reduce_prod", # disabled due to b/111823366
"relu",
"relu1",
"relu6",
@@ -264,58 +290,58 @@ def generated_test_models():
]
def gen_zip_test(name, test_name, **kwargs):
- """Generate a zipped-example test and its dependent zip files.
+ """Generate a zipped-example test and its dependent zip files.
- Args:
- name: Resulting cc_test target name
- test_name: Test targets this model. Comes from the list above.
- **kwargs: tf_cc_test kwargs.
- """
- gen_zipped_test_file(
- name = "zip_%s" % test_name,
- file = "%s.zip" % test_name,
- )
- tf_cc_test(name, **kwargs)
+ Args:
+ name: Resulting cc_test target name.
+ test_name: The model this test targets; comes from the list above.
+ **kwargs: tf_cc_test kwargs.
+ """
+ gen_zipped_test_file(
+ name = "zip_%s" % test_name,
+ file = "%s.zip" % test_name,
+ )
+ tf_cc_test(name, **kwargs)
def gen_zipped_test_file(name, file):
- """Generate a zip file of tests by using :generate_examples.
+ """Generate a zip file of tests by using :generate_examples.
- Args:
- name: Name of output. We will produce "`file`.files" as a target.
- file: The name of one of the generated_examples targets, e.g. "transpose"
- """
- toco = "//tensorflow/contrib/lite/toco:toco"
- native.genrule(
- name = file + ".files",
- cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco
- + " --zip_to_output " + file + " $(@D)"),
- outs = [file],
- tools = [
- ":generate_examples",
- toco,
- ],
- )
+ Args:
+ name: Name of output. We will produce "`file`.files" as a target.
+ file: The name of one of the generated_examples targets, e.g. "transpose".
+ """
+ toco = "//tensorflow/contrib/lite/toco:toco"
+ native.genrule(
+ name = file + ".files",
+ cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco +
+ " --zip_to_output " + file + " $(@D)"),
+ outs = [file],
+ tools = [
+ ":generate_examples",
+ toco,
+ ],
+ )
- native.filegroup(
- name = name,
- srcs = [file],
- )
+ native.filegroup(
+ name = name,
+ srcs = [file],
+ )
def gen_selected_ops(name, model):
- """Generate the library that includes only used ops.
+ """Generate the library that includes only used ops.
- Args:
- name: Name of the generated library.
- model: TFLite model to interpret.
- """
- out = name + "_registration.cc"
- tool = "//tensorflow/contrib/lite/tools:generate_op_registrations"
- tflite_path = "//tensorflow/contrib/lite"
- native.genrule(
- name = name,
- srcs = [model],
- outs = [out],
- cmd = ("$(location %s) --input_model=$(location %s) --output_registration=$(location %s) --tflite_path=%s")
- % (tool, model, out, tflite_path[2:]),
- tools = [tool],
- )
+ Args:
+ name: Name of the generated library.
+ model: TFLite model to interpret.
+ """
+ out = name + "_registration.cc"
+ tool = "//tensorflow/contrib/lite/tools:generate_op_registrations"
+ tflite_path = "//tensorflow/contrib/lite"
+ native.genrule(
+ name = name,
+ srcs = [model],
+ outs = [out],
+ cmd = ("$(location %s) --input_model=$(location %s) --output_registration=$(location %s) --tflite_path=%s") %
+ (tool, model, out, tflite_path[2:]),
+ tools = [tool],
+ )
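A sketch of how gen_selected_ops might be invoked (the model label is illustrative); the generated registration source can then be compiled in place of the full builtin-op registry:

gen_selected_ops(
    name = "my_model_ops",
    model = ":my_model.tflite",  # Illustrative model target.
)
# Produces "my_model_ops_registration.cc" for use as a srcs entry.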
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index a24aaad7dd..70178b2faa 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -277,6 +277,15 @@ typedef struct {
bool narrow_range;
} TfLiteFakeQuantParams;
+typedef struct {
+ int values_count;
+ int axis;
+} TfLitePackParams;
+
+typedef struct {
+ int axis;
+} TfLiteOneHotParams;
+
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
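By TF Lite convention, parsed parameter structs like the two added above reach a kernel through node->builtin_data. A minimal sketch of a Pack prepare step under that assumption (the function and its checks are illustrative, not part of this change):

#include "tensorflow/contrib/lite/context.h"
#include "tensorflow/contrib/lite/kernels/kernel_util.h"

TfLiteStatus PreparePack(TfLiteContext* context, TfLiteNode* node) {
  const auto* params =
      reinterpret_cast<const TfLitePackParams*>(node->builtin_data);
  // One input tensor per packed value; params->axis selects the dimension
  // along which the inputs are stacked.
  TF_LITE_ENSURE_EQ(context, NumInputs(node), params->values_count);
  return kTfLiteOk;
}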
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 558e547121..0b6568fd2f 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -109,6 +109,8 @@ typedef enum {
kTfLiteBuiltinReduceProd = 81,
kTfLiteBuiltinReduceMax = 82,
kTfLiteBuiltinPack = 83,
+ kTfLiteBuiltinLogicalOr = 84,
+ kTfLiteBuiltinOneHot = 85,
} TfLiteBuiltinOperator;
#ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index 1ff8843fa7..5bc20106d3 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -29,6 +29,9 @@ limitations under the License.
#ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
#define TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#if defined(_MSC_VER)
+#include <complex.h>
+#endif
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
@@ -180,7 +183,11 @@ typedef union {
uint8_t* uint8;
bool* b;
int16_t* i16;
+#if defined(_MSC_VER)
+ _Fcomplex* c64;
+#else
_Complex float* c64;
+#endif
} TfLitePtrUnion;
// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
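One consequence of the union change above: kernels should read complex elements through the C99 accessors rather than GNU-style member syntax, since crealf()/cimagf() are provided for both _Fcomplex (MSVC) and _Complex float (GCC/Clang). A small sketch under that assumption:

#include <complex.h>

/* Reads the first complex element of a tensor's data; compiles unchanged
 * under MSVC and GCC/Clang because crealf() accepts either complex type. */
static float FirstRealComponent(const TfLitePtrUnion* data) {
  return crealf(data->c64[0]);
}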
@@ -464,6 +471,12 @@ typedef struct _TfLiteDelegate {
} TfLiteDelegate;
// WARNING: This is an experimental interface that is subject to change.
+//
+// Currently, TfLiteDelegateParams has to be allocated in a way that makes it
+// trivially destructible. It will be stored in the `builtin_data` field of
+// the delegate node's `TfLiteNode`.
+//
+// See also the `CreateDelegateParams` function in `interpreter.cc` for
+// details.
typedef struct {
TfLiteDelegate* delegate;
TfLiteIntArray* nodes_to_replace;
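A sketch of what the trivially-destructible requirement means in practice, assuming the params block is released with a plain free() and never sees a destructor: every variable-length piece must be carved out of one allocation. Layout and field coverage here are illustrative; the authoritative version is CreateDelegateParams in interpreter.cc:

static TfLiteDelegateParams* AllocateParams(TfLiteDelegate* delegate,
                                            int num_nodes) {
  /* One block holds the struct and the int array, so a single free()
   * reclaims everything. Remaining array members omitted for brevity. */
  char* block = malloc(sizeof(TfLiteDelegateParams) +
                       TfLiteIntArrayGetSizeInBytes(num_nodes));
  TfLiteDelegateParams* params = (TfLiteDelegateParams*)block;
  params->delegate = delegate;
  params->nodes_to_replace =
      (TfLiteIntArray*)(block + sizeof(TfLiteDelegateParams));
  params->nodes_to_replace->size = num_nodes;
  return params; /* Caller releases with free(). */
}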
diff --git a/tensorflow/contrib/lite/delegates/eager/BUILD b/tensorflow/contrib/lite/delegates/eager/BUILD
index 03a4b7bf1d..a28707382e 100644
--- a/tensorflow/contrib/lite/delegates/eager/BUILD
+++ b/tensorflow/contrib/lite/delegates/eager/BUILD
@@ -68,6 +68,43 @@ cc_test(
)
cc_library(
+ name = "kernel",
+ srcs = ["kernel.cc"],
+ hdrs = ["kernel.h"],
+ deps = [
+ ":delegate_data",
+ ":util",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite:kernel_api",
+ "//tensorflow/contrib/lite/kernels:kernel_util",
+ "//tensorflow/core:protos_all_cc",
+ "//tensorflow/core/common_runtime/eager:context",
+ "//tensorflow/core/common_runtime/eager:execute",
+ "//tensorflow/core/common_runtime/eager:tensor_handle",
+ "@flatbuffers",
+ ],
+)
+
+cc_test(
+ name = "kernel_test",
+ size = "small",
+ srcs = ["kernel_test.cc"],
+ tags = [
+ "no_oss",
+ "tflite_not_portable",
+ ],
+ deps = [
+ ":delegate_data",
+ ":kernel",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "//tensorflow/contrib/lite/testing:util",
+ "@com_google_absl//absl/memory",
+ "@com_google_googletest//:gtest",
+ "@flatbuffers",
+ ],
+)
+
+cc_library(
name = "util",
srcs = ["util.cc"],
hdrs = ["util.h"],
diff --git a/tensorflow/contrib/lite/delegates/eager/buffer_map.cc b/tensorflow/contrib/lite/delegates/eager/buffer_map.cc
index 1d6453f498..e5a19c3997 100644
--- a/tensorflow/contrib/lite/delegates/eager/buffer_map.cc
+++ b/tensorflow/contrib/lite/delegates/eager/buffer_map.cc
@@ -91,6 +91,10 @@ void BufferMap::SetFromTfLite(int tensor_index, const TfLiteTensor* tensor) {
for (int i = 0; i < num_dims; ++i) {
shape.AddDim(tensor->dims->data[i]);
}
+ // TODO(ahentz): we assume this is a new tensor and allocate a new buffer
+ // for it. This is not always the best approach. For example, this might
+ // be a reallocation after resizing tensors. In that case it would be
+ // preferable to somehow reuse the buffer.
auto* buf = new TfLiteTensorBuffer(tensor);
tensorflow::Tensor t = tensorflow::TensorCApi::MakeTensor(
GetTensorFlowDataType(tensor->type), shape, buf);
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_data.cc b/tensorflow/contrib/lite/delegates/eager/delegate_data.cc
index 29687694bd..0fd5c976f8 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_data.cc
+++ b/tensorflow/contrib/lite/delegates/eager/delegate_data.cc
@@ -23,7 +23,8 @@ tensorflow::Status DelegateData::Create(std::unique_ptr<DelegateData>* data) {
std::vector<tensorflow::Device*> devices;
TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
- tensorflow::SessionOptions(), "/device:cpu:*", &devices));
+ tensorflow::SessionOptions(), "/job:localhost/replica:0/task:0",
+ &devices));
std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
new tensorflow::DeviceMgr(devices));
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.cc b/tensorflow/contrib/lite/delegates/eager/kernel.cc
new file mode 100644
index 0000000000..1727981807
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/eager/kernel.cc
@@ -0,0 +1,289 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/delegates/eager/kernel.h"
+
+#include "third_party/flatbuffers/include/flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/builtin_ops.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/context_util.h"
+#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
+#include "tensorflow/contrib/lite/delegates/eager/util.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+// Note: this is part of TF Lite's Eager delegation code, which is to be
+// completed soon.
+
+// This is the TF Lite op that is created by the eager delegate to handle
+// execution of a supported subgraph. The usual flow is that the delegate
+// informs the interpreter of supported nodes in a graph, and each supported
+// subgraph is replaced with one instance of this kernel.
+//
+// The kernel is initialized with TfLiteDelegateParams from which we retrieve
+// the global EagerContext and BufferMap, as well as a list of inputs and
+// outputs to the subgraph. Those are used to build the OpData, with a list of
+// TensorFlow Ops that should be executed in order (which we call an OpNode).
+//
+// For each node included in the subgraph, we query the interpreter and
+// retrieve the associated NodeDef, which is then used to configure the
+// corresponding TensorFlow/Eager Op.
+
+namespace tflite {
+namespace eager {
+namespace kernel {
+
+// Controls the lifetime of tensor handles in a vector.
+class VectorOfHandles {
+ public:
+ explicit VectorOfHandles(int num_elements) : vector_(num_elements, nullptr) {}
+
+ ~VectorOfHandles() {
+ for (auto* handle : vector_) {
+ if (handle) handle->Unref();
+ }
+ }
+
+ tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* GetVector() {
+ return &vector_;
+ }
+
+ tensorflow::TensorHandle* GetHandle(int index) { return vector_[index]; }
+
+ private:
+ tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> vector_;
+};
+
+// Executes the TensorFlow op given by 'op_name', with the attributes specified
+// in 'nodedef'. Inputs and outputs are given as indices into the 'buffer_map'.
+tensorflow::Status ExecuteEagerOp(tensorflow::EagerContext* eager_context,
+ BufferMap* buffer_map, const string& op_name,
+ const tensorflow::NodeDef& nodedef,
+ const std::vector<int>& inputs,
+ const std::vector<int>& outputs) {
+ const tensorflow::AttrTypeMap* attr_types;
+ TF_RETURN_WITH_CONTEXT_IF_ERROR(
+ tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types),
+ " (while processing attributes of '", op_name, "')");
+
+ tensorflow::EagerOperation op(eager_context, op_name.c_str(), attr_types);
+ for (const auto& attr : nodedef.attr()) {
+ op.MutableAttrs()->Set(attr.first, attr.second);
+ }
+
+ for (int input_index : inputs) {
+ if (!buffer_map->HasTensor(input_index)) {
+ return tensorflow::errors::Internal(
+ "Cannot read from invalid tensor index ", input_index);
+ }
+ auto* handle = new tensorflow::TensorHandle(
+ buffer_map->GetTensor(input_index), nullptr, nullptr, nullptr);
+ op.AddInput(handle);
+ handle->Unref();
+ }
+
+ int num_retvals = outputs.size();
+ VectorOfHandles retvals(num_retvals);
+ TF_RETURN_WITH_CONTEXT_IF_ERROR(
+ EagerExecute(&op, retvals.GetVector(), &num_retvals),
+ " (while executing '", op_name, "' via Eager)");
+
+ if (num_retvals != outputs.size()) {
+ return tensorflow::errors::Internal(
+ "Unexpected number of outputs from EagerExecute");
+ }
+
+ for (int i = 0; i < num_retvals; ++i) {
+ const tensorflow::Tensor* tensor = nullptr;
+ TF_RETURN_IF_ERROR(retvals.GetHandle(i)->Tensor(&tensor));
+ buffer_map->SetFromTensorFlow(outputs[i], *tensor);
+ }
+
+ return tensorflow::Status::OK();
+}
+
+// A single node within the larger 'op'. Note that this kernel executes many
+// TensorFlow ops within a single TF Lite op.
+struct OpNode {
+ // The name of the TensorFlow op to execute.
+ string name;
+ // The corresponding NodeDef, containing the attributes for the op.
+ tensorflow::NodeDef nodedef;
+ // List of inputs, as TF Lite tensor indices.
+ std::vector<int> inputs;
+ // List of outputs, as TF Lite tensor indices.
+ std::vector<int> outputs;
+};
+
+// The larger 'op', which contains all the nodes in a supported subgraph.
+struct OpData {
+ tensorflow::EagerContext* eager_context;
+ BufferMap* buffer_map;
+ std::vector<OpNode> nodes;
+ std::vector<int> subgraph_inputs;
+ std::vector<int> subgraph_outputs;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ auto* op_data = new OpData;
+
+ const TfLiteDelegateParams* params =
+ reinterpret_cast<const TfLiteDelegateParams*>(buffer);
+ CHECK(params);
+ CHECK(params->delegate);
+ CHECK(params->delegate->data_);
+ op_data->eager_context =
+ reinterpret_cast<DelegateData*>(params->delegate->data_)
+ ->GetEagerContext();
+ op_data->buffer_map =
+ reinterpret_cast<DelegateData*>(params->delegate->data_)->GetBufferMap();
+
+ CHECK(params->output_tensors);
+ for (auto tensor_index : TfLiteIntArrayView(params->output_tensors)) {
+ op_data->subgraph_outputs.push_back(tensor_index);
+ }
+
+ CHECK(params->input_tensors);
+ for (auto tensor_index : TfLiteIntArrayView(params->input_tensors)) {
+ op_data->subgraph_inputs.push_back(tensor_index);
+ }
+
+ CHECK(params->nodes_to_replace);
+ for (auto node_index : TfLiteIntArrayView(params->nodes_to_replace)) {
+ TfLiteNode* node;
+ TfLiteRegistration* reg;
+ context->GetNodeAndRegistration(context, node_index, &node, &reg);
+
+ op_data->nodes.push_back(OpNode());
+ OpNode& node_data = op_data->nodes.back();
+
+ node_data.name = "";
+ if (node->custom_initial_data) {
+ // The flexbuffer contains a vector where the first element is the
+ // op name and the second is a serialized NodeDef.
+ const flexbuffers::Vector& v =
+ flexbuffers::GetRoot(
+ reinterpret_cast<const uint8_t*>(node->custom_initial_data),
+ node->custom_initial_data_size)
+ .AsVector();
+
+ node_data.name = v[0].AsString().str();
+ if (!node_data.nodedef.ParseFromString(v[1].AsString().str())) {
+ // We will just leave the nodedef empty and error out in Eval().
+ node_data.nodedef.Clear();
+ }
+ }
+
+ for (auto input_index : TfLiteIntArrayView(node->inputs)) {
+ node_data.inputs.push_back(input_index);
+ }
+ for (auto output_index : TfLiteIntArrayView(node->outputs)) {
+ node_data.outputs.push_back(output_index);
+ }
+ }
+
+ return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ const auto* op_data = reinterpret_cast<OpData*>(node->user_data);
+ TF_LITE_ENSURE_MSG(
+ context, op_data->eager_context != nullptr,
+ "Failed to initialize eager context. This often happens when a CPU "
+ "device has not been registered, presumably because some symbols from "
+ "tensorflow/core:core_cpu_impl were not linked into the binary.");
+
+ // Whenever we find a constant tensor, insert it in the buffer map.
+ BufferMap* buffer_map = op_data->buffer_map;
+ for (auto tensor_index : op_data->subgraph_inputs) {
+ TfLiteTensor* tensor = &context->tensors[tensor_index];
+ if (IsConstantTensor(tensor)) {
+ if (!buffer_map->HasTensor(tensor_index)) {
+ buffer_map->SetFromTfLite(tensor_index, tensor);
+ }
+ }
+ }
+
+ // All output tensors are allocated by TensorFlow/Eager, so we
+ // mark them as kTfLiteDynamic.
+ for (auto tensor_index : op_data->subgraph_outputs) {
+ SetTensorToDynamic(&context->tensors[tensor_index]);
+ }
+
+ return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ const auto* op_data = reinterpret_cast<OpData*>(node->user_data);
+ BufferMap* buffer_map = op_data->buffer_map;
+ tensorflow::EagerContext* eager_context = op_data->eager_context;
+
+ // Insert a tensor in the buffer map for each input that is not constant.
+ // Constants were handled in Prepare() already.
+ for (auto tensor_index : op_data->subgraph_inputs) {
+ TfLiteTensor* tensor = &context->tensors[tensor_index];
+ if (!IsConstantTensor(tensor)) {
+ buffer_map->SetFromTfLite(tensor_index, tensor);
+ }
+ }
+
+ // Execute the TensorFlow Ops sequentially.
+ for (const auto& node_data : op_data->nodes) {
+ if (node_data.nodedef.op().empty()) {
+ context->ReportError(context, "Invalid NodeDef in Eager op '%s'",
+ node_data.name.c_str());
+ return kTfLiteError;
+ }
+ auto status =
+ ExecuteEagerOp(eager_context, buffer_map, node_data.name,
+ node_data.nodedef, node_data.inputs, node_data.outputs);
+ TF_LITE_ENSURE_OK(context, ConvertStatus(context, status));
+ }
+
+ for (auto tensor_index : op_data->subgraph_outputs) {
+ if (!buffer_map->HasTensor(tensor_index)) {
+ context->ReportError(context, "Cannot write to invalid tensor index %d",
+ tensor_index);
+ return kTfLiteError;
+ }
+
+ TfLiteTensor* tensor = &context->tensors[tensor_index];
+ TF_LITE_ENSURE_OK(
+ context,
+ CopyShape(context, buffer_map->GetTensor(tensor_index), tensor));
+ tensor->buffer_handle = tensor_index;
+ tensor->data_is_stale = true;
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace kernel
+
+TfLiteRegistration GetKernel() {
+ TfLiteRegistration registration{&kernel::Init, &kernel::Free,
+ &kernel::Prepare, &kernel::Eval,
+ nullptr, kTfLiteBuiltinDelegate};
+ return registration;
+}
+
+} // namespace eager
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.h b/tensorflow/contrib/lite/delegates/eager/kernel.h
new file mode 100644
index 0000000000..100672c82d
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/eager/kernel.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_KERNEL_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_KERNEL_H_
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+namespace eager {
+
+// Returns the registration object used to initialize and execute ops that will
+// be delegated to TensorFlow's Eager runtime. This TF Lite op is created by
+// the eager delegate to handle execution of a supported subgraph. The usual
+// flow is that the delegate informs the interpreter of supported nodes in a
+// graph, and each supported subgraph is replaced with one instance of this
+// kernel.
+TfLiteRegistration GetKernel();
+
+} // namespace eager
+} // namespace tflite
+
+#endif // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_KERNEL_H_
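A minimal sketch of the flow described in the header comment, assuming a delegate whose Prepare claims every node in the execution plan; it mirrors the GenericPrepare helper in the test file below (error handling elided):

#include "tensorflow/contrib/lite/context.h"
#include "tensorflow/contrib/lite/delegates/eager/kernel.h"

TfLiteStatus PrepareAll(TfLiteContext* context, TfLiteDelegate* delegate) {
  TfLiteIntArray* plan = nullptr;  // Owned by the context; do not free.
  TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
  return context->ReplaceSubgraphsWithDelegateKernels(
      context, tflite::eager::GetKernel(), plan, delegate);
}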
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
new file mode 100644
index 0000000000..7d9dddef93
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
@@ -0,0 +1,351 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/delegates/eager/kernel.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "third_party/flatbuffers/include/flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace eager {
+namespace {
+
+using tensorflow::protobuf::TextFormat;
+using ::testing::ContainsRegex;
+using ::testing::ElementsAre;
+
+// We will use these as custom_names, so they need to be static.
+static const char kIdentity[] = "Identity";
+static const char kUnpack[] = "Unpack";
+static const char kAdd[] = "Add";
+static const char kMul[] = "Mul";
+
+TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
+ const std::vector<int>& supported_nodes) {
+ TfLiteIntArray* size_and_nodes =
+ ConvertVectorToTfLiteIntArray(supported_nodes);
+ TF_LITE_ENSURE_STATUS(context->ReplaceSubgraphsWithDelegateKernels(
+ context, eager::GetKernel(), size_and_nodes, delegate));
+ TfLiteIntArrayFree(size_and_nodes);
+ return kTfLiteOk;
+}
+
+class KernelTest : public ::testing::Test {
+ public:
+ KernelTest() {
+ CHECK(DelegateData::Create(&delegate_data_).ok());
+ interpreter_.reset(new Interpreter(&error_reporter_));
+ }
+
+ bool Invoke() { return interpreter_->Invoke() == kTfLiteOk; }
+
+ void SetValues(int tensor_index, const std::vector<float>& values) {
+ float* v = interpreter_->typed_tensor<float>(tensor_index);
+ for (float f : values) {
+ *v++ = f;
+ }
+ }
+
+ std::vector<float> GetValues(int tensor_index) {
+ TfLiteTensor* o = interpreter_->tensor(tensor_index);
+ return std::vector<float>(o->data.f, o->data.f + o->bytes / sizeof(float));
+ }
+
+ void SetShape(int tensor_index, const std::vector<int>& values) {
+ ASSERT_EQ(interpreter_->ResizeInputTensor(tensor_index, values), kTfLiteOk);
+ ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+ }
+
+ std::vector<int> GetShape(int tensor_index) {
+ std::vector<int> result;
+ auto* dims = interpreter_->tensor(tensor_index)->dims;
+ for (int i = 0; i < dims->size; ++i) {
+ result.push_back(dims->data[i]);
+ }
+ return result;
+ }
+
+ template <typename T>
+ void ConfigureDelegate(T prepare_function) {
+ delegate_.data_ = delegate_data_.get();
+ delegate_.FreeBufferHandle = nullptr;
+ delegate_.Prepare = prepare_function;
+ delegate_.CopyFromBufferHandle = [](TfLiteDelegate* delegate,
+ TfLiteBufferHandle buffer_handle,
+ void* data, size_t size) {
+ auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
+ tensorflow::StringPiece values =
+ delegate_data->GetBufferMap()->GetTensor(buffer_handle).tensor_data();
+ memcpy(data, values.data(), values.size());
+ return kTfLiteOk;
+ };
+ CHECK(interpreter_->ModifyGraphWithDelegate(
+ &delegate_, /*allow_dynamic_tensors=*/true) == kTfLiteOk);
+ }
+
+ void AddOp(const char* name, const std::vector<int>& inputs,
+ const std::vector<int>& outputs) {
+ auto attr = [](const string& key, const string& value) {
+ return " attr{ key: '" + key + "' value {" + value + "}}";
+ };
+
+ string attributes;
+ if (name == string(kUnpack)) {
+ attributes = attr("T", "type: DT_FLOAT") + attr("num", "i: 2") +
+ attr("axis", "i: 0");
+ } else if (name == string(kIdentity)) {
+ attributes = attr("T", "type: DT_FLOAT");
+ } else if (name == string(kAdd)) {
+ attributes = attr("T", "type: DT_FLOAT");
+ } else if (name == string(kMul)) {
+ attributes = attr("T", "type: DT_FLOAT");
+ }
+ AddTfOp(name, attributes, inputs, outputs);
+ }
+
+ void AddTensors(int num_tensors, const std::vector<int>& inputs,
+ const std::vector<int>& outputs) {
+ interpreter_->AddTensors(num_tensors);
+ for (int i = 0; i < num_tensors; ++i) {
+ TfLiteQuantizationParams quant;
+ CHECK_EQ(interpreter_->SetTensorParametersReadWrite(i, kTfLiteFloat32,
+ /*name=*/"",
+ /*dims=*/{3}, quant),
+ kTfLiteOk);
+ }
+
+ CHECK_EQ(interpreter_->SetInputs(inputs), kTfLiteOk);
+ CHECK_EQ(interpreter_->SetOutputs(outputs), kTfLiteOk);
+ }
+
+ const TestErrorReporter& error_reporter() const { return error_reporter_; }
+
+ void AddTfLiteOp(const char* name, const std::vector<int>& inputs,
+ const std::vector<int>& outputs) {
+ CHECK_EQ(string(name), kMul); // can only add MUL
+ static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+ reg.builtin_code = BuiltinOperator_MUL;
+ reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+ auto* i0 = &context->tensors[node->inputs->data[0]];
+ auto* o = &context->tensors[node->outputs->data[0]];
+ return context->ResizeTensor(context, o, TfLiteIntArrayCopy(i0->dims));
+ };
+ reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+ auto* i0 = &context->tensors[node->inputs->data[0]];
+ auto* i1 = &context->tensors[node->inputs->data[1]];
+ auto* o = &context->tensors[node->outputs->data[0]];
+ for (int i = 0; i < o->bytes / sizeof(float); ++i) {
+ o->data.f[i] = i0->data.f[i] * i1->data.f[i];
+ }
+ return kTfLiteOk;
+ };
+
+ CHECK_EQ(interpreter_->AddNodeWithParameters(inputs, outputs, nullptr, 0,
+ nullptr, &reg),
+ kTfLiteOk);
+ }
+
+ private:
+ void AddTfOp(const char* name, const string& nodedef_str,
+ const std::vector<int>& inputs,
+ const std::vector<int>& outputs) {
+ static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+ reg.builtin_code = BuiltinOperator_CUSTOM;
+ reg.custom_name = name;
+
+ tensorflow::NodeDef nodedef;
+ CHECK(TextFormat::ParseFromString(nodedef_str + " op: '" + name + "'",
+ &nodedef));
+ string serialized_nodedef;
+ CHECK(nodedef.SerializeToString(&serialized_nodedef));
+ flexbuffers::Builder fbb;
+ fbb.Vector([&]() {
+ fbb.String(nodedef.op());
+ fbb.String(serialized_nodedef);
+ });
+ fbb.Finish();
+
+ flexbuffers_.push_back(fbb.GetBuffer());
+ auto& buffer = flexbuffers_.back();
+ CHECK_EQ(interpreter_->AddNodeWithParameters(
+ inputs, outputs, reinterpret_cast<const char*>(buffer.data()),
+ buffer.size(), nullptr, &reg),
+ kTfLiteOk);
+ }
+
+ std::unique_ptr<Interpreter> interpreter_;
+ std::unique_ptr<DelegateData> delegate_data_;
+ TfLiteDelegate delegate_;
+ std::vector<std::vector<uint8_t>> flexbuffers_;
+ TestErrorReporter error_reporter_;
+};
+
+TEST_F(KernelTest, FullGraph) {
+ // Define the graph.
+ AddTensors(9, {0, 3}, {8});
+
+ AddOp(kUnpack, {0}, {1, 2});
+ AddOp(kUnpack, {3}, {4, 5});
+ AddOp(kAdd, {1, 4}, {6});
+ AddOp(kAdd, {2, 5}, {7});
+ AddOp(kMul, {6, 7}, {8});
+
+ // Apply Delegate.
+ ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+ return GenericPrepare(context, delegate, {0, 1, 2, 3, 4});
+ });
+
+ // Define inputs.
+ SetShape(0, {2, 2, 1});
+ SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+ SetShape(3, {2, 2, 1});
+ SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+
+ ASSERT_TRUE(Invoke());
+
+ ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
+ ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+}
+
+TEST_F(KernelTest, BadTensorFlowOp) {
+ AddTensors(2, {0}, {1});
+ AddOp("NonExistentOp", {0}, {1});
+
+ ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+ return GenericPrepare(context, delegate, {0});
+ });
+
+ SetShape(0, {2, 2, 1});
+ SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+ ASSERT_FALSE(Invoke());
+ ASSERT_THAT(error_reporter().error_messages(),
+ ContainsRegex("while processing attributes of 'NonExistentOp'"));
+}
+
+TEST_F(KernelTest, BadNumberOfOutputs) {
+ AddTensors(3, {0}, {1, 2});
+ AddOp(kIdentity, {0}, {1, 2});
+
+ ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+ return GenericPrepare(context, delegate, {0});
+ });
+
+ SetShape(0, {2, 2, 1});
+ SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+ ASSERT_FALSE(Invoke());
+ ASSERT_THAT(error_reporter().error_messages(),
+ ContainsRegex("Unexpected number of outputs"));
+}
+
+TEST_F(KernelTest, IncompatibleNodeDef) {
+ AddTensors(2, {0}, {1});
+
+ // Cast is a TF op, but we don't add the proper nodedef to it in AddOp.
+ AddOp("Cast", {0}, {1});
+
+ ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+ return GenericPrepare(context, delegate, {0});
+ });
+
+ SetShape(0, {2, 2, 1});
+ SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+ ASSERT_FALSE(Invoke());
+ ASSERT_THAT(error_reporter().error_messages(),
+ ContainsRegex("while executing 'Cast' via Eager"));
+}
+
+TEST_F(KernelTest, WrongSetOfNodes) {
+ AddTensors(4, {0}, {3});
+ AddOp(kUnpack, {0}, {1, 2});
+ AddTfLiteOp(kMul, {1, 2}, {3});
+
+ // Specify that kMul (#1) is supported when it actually isn't.
+ ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+ return GenericPrepare(context, delegate, {0, 1});
+ });
+
+ SetShape(0, {2, 2, 1});
+ SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+ ASSERT_FALSE(Invoke());
+ ASSERT_THAT(error_reporter().error_messages(),
+ ContainsRegex("Invalid NodeDef in Eager op"));
+}
+
+TEST_F(KernelTest, MixedGraph) {
+ AddTensors(9, {0, 3}, {8});
+
+ AddOp(kUnpack, {0}, {1, 2});
+ AddOp(kUnpack, {3}, {4, 5});
+ AddOp(kAdd, {1, 4}, {6});
+ AddOp(kAdd, {2, 5}, {7});
+ AddTfLiteOp(kMul, {6, 7}, {8});
+
+ ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+ return GenericPrepare(context, delegate, {0, 1, 2, 3});
+ });
+
+ SetShape(0, {2, 2, 1});
+ SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+ SetShape(3, {2, 2, 1});
+ SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+
+ ASSERT_TRUE(Invoke());
+
+ ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
+ ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+}
+
+TEST_F(KernelTest, SplitGraph) {
+ AddTensors(10, {0}, {9});
+
+ AddOp(kUnpack, {0}, {1, 2});
+ AddOp(kAdd, {1, 2}, {3});
+ AddOp(kUnpack, {3}, {4, 5});
+
+ AddTfLiteOp(kMul, {4, 5}, {6});
+
+ AddOp(kUnpack, {6}, {7, 8});
+ AddOp(kAdd, {7, 8}, {9});
+
+ ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+ return GenericPrepare(context, delegate, {0, 1, 2, 4, 5});
+ });
+
+ SetShape(0, {2, 2, 2, 1});
+ SetValues(0, {3.0f, 1.0f, 0.5f, -1.0f, 0.0f, 1.0f, 1.5f, 3.0f});
+
+ ASSERT_TRUE(Invoke());
+
+ ASSERT_THAT(GetShape(9), ElementsAre(1));
+ ASSERT_THAT(GetValues(9), ElementsAre(10.0f));
+}
+
+} // namespace
+} // namespace eager
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ ::tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
index f0d16575ec..60855eb8ed 100644
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
@@ -436,7 +436,6 @@ class NNAPIDelegateKernel {
}
break;
case kTfLiteBuiltinSqueeze:
- // Squeeze requires NNAPI1.1.
if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
return [](TfLiteContext* context, NNAPIOpBuilder* builder,
TfLiteNode* node) -> ANeuralNetworksOperationType {
@@ -452,9 +451,227 @@ class NNAPIDelegateKernel {
} else {
return nullptr;
}
+ case kTfLiteBuiltinL2Normalization: {
+ auto builtin =
+ reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
+ if (builtin->activation != kTfLiteActNone) {
+ // NNAPI does not support activations
+ return nullptr;
+ }
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_L2_NORMALIZATION;
+ };
+ }
+ case kTfLiteBuiltinLocalResponseNormalization:
+ if (version == 1) {
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ auto builtin = reinterpret_cast<TfLiteLocalResponseNormParams*>(
+ node->builtin_data);
+ builder->AddScalarInt32Operand(builtin->radius);
+ builder->AddScalarFloat32Operand(builtin->bias);
+ builder->AddScalarFloat32Operand(builtin->alpha);
+ builder->AddScalarFloat32Operand(builtin->beta);
+ return ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION;
+ };
+ } else {
+ // TODO(miaowang): clean up the code and return early in the unsupported
+ // case.
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinLshProjection:
+ if (version == 1) {
+ // NNAPI does not support sparse projection correctly (b/111751836).
+ if (reinterpret_cast<TfLiteLSHProjectionParams*>(node->builtin_data)
+ ->type == kTfLiteLshProjectionSparse) {
+ return nullptr;
+ }
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ auto builtin = reinterpret_cast<TfLiteLSHProjectionParams*>(
+ node->builtin_data);
+ builder->AddScalarInt32Operand(builtin->type);
+ return ANEURALNETWORKS_LSH_PROJECTION;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinConcatenation:
+ if (version == 1 &&
+ reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data)
+ ->activation == kTfLiteActNone) {
+ if (context->tensors[node->inputs->data[0]].type == kTfLiteUInt8) {
+ // NNAPI only supports concatenating quantized tensors of the same
+ // scale and offset.
+ auto first_param = context->tensors[node->inputs->data[0]].params;
+ for (int i = 0; i < node->inputs->size; i++) {
+ auto curr_param = context->tensors[node->inputs->data[i]].params;
+ if (curr_param.scale != first_param.scale ||
+ curr_param.zero_point != first_param.zero_point) {
+ return nullptr;
+ }
+ }
+ }
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(
+ node->builtin_data);
+ builder->AddScalarInt32Operand(builtin->axis);
+ return ANEURALNETWORKS_CONCATENATION;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinDequantize:
+ if (version == 1) {
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_DEQUANTIZE;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinFloor:
+ if (version == 1) {
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_FLOOR;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinRelu:
+ if (version == 1) {
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_RELU;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinReluN1To1:
+ if (version == 1) {
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_RELU1;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinRelu6:
+ if (version == 1) {
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_RELU6;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinLogistic:
+ if (version == 1) {
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_LOGISTIC;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinTanh:
+ // TODO(miaowang): add additional checks for the parameters.
+ if (version == 1 &&
+ context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+ // NNAPI only supports float tanh.
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_TANH;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinSub:
+ if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+ context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+ // NNAPI only supports float sub.
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ auto builtin =
+ reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
+ builder->AddScalarInt32Operand(builtin->activation);
+ return ANEURALNETWORKS_SUB;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinDiv:
+ if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+ context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+ // NNAPI only supports float div.
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ auto builtin =
+ reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
+ builder->AddScalarInt32Operand(builtin->activation);
+ return ANEURALNETWORKS_DIV;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinPad:
+ if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+ node->inputs->size == 2 &&
+ context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+ // NNAPI does not support specifying the padding value.
+ // NNAPI pads with physical zeros for quantized tensors, so only delegate
+ // float pad to NNAPI.
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_PAD;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinSpaceToBatchNd:
+ if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ return ANEURALNETWORKS_SPACE_TO_BATCH_ND;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
+ case kTfLiteBuiltinStridedSlice:
+ if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+ return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+ TfLiteNode* node) -> ANeuralNetworksOperationType {
+ auto builtin =
+ reinterpret_cast<TfLiteStridedSliceParams*>(node->builtin_data);
+ builder->AddScalarInt32Operand(builtin->begin_mask);
+ builder->AddScalarInt32Operand(builtin->end_mask);
+ builder->AddScalarInt32Operand(builtin->shrink_axis_mask);
+ return ANEURALNETWORKS_STRIDED_SLICE;
+ };
+ } else {
+ return nullptr;
+ }
+ break;
case kTfLiteBuiltinTranspose:
- // Transpose requires NNAPI1.1. Also note that the permutation input
- // tensor value dictates the output dimensions.
+ // Note that the permutation input tensor value dictates the output
+ // dimensions.
// TODO(b/110888333): Support dynamically-sized tensors in delegates.
if ((version == 1) &&
(kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) &&
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
index ab2181e8ff..b7b159c59f 100644
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -82,7 +82,7 @@ TEST(NNAPIDelegate, AddWithRelu) {
EXPECT_THAT(m.GetOutput(), ElementsAreArray({0.0, 0.4, 1.0, 1.3}));
}
-class FloatMulOpModel : public SingleOpModel {
+class FloatMulOpModel : public SingleOpModelWithNNAPI {
public:
FloatMulOpModel(const TensorData& input1, const TensorData& input2,
const TensorData& output,
@@ -184,7 +184,7 @@ TEST(NNAPIDelegate, L2PoolWithNoActivation) {
EXPECT_THAT(m.GetOutput(), ElementsAreArray({3.5, 6.5}));
}
-class BaseConvolutionOpModel : public SingleOpModel {
+class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
public:
BaseConvolutionOpModel(
const TensorData& input, const TensorData& filter,
@@ -641,6 +641,41 @@ TEST(NNAPIDelegate, SqueezeWithAxisTest) {
17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}));
}
+class L2NormOpModel : public SingleOpModelWithNNAPI {
+ public:
+ L2NormOpModel(const TensorData& input, const TensorData& output,
+ ActivationFunctionType activation_type) {
+ input_ = AddInput(input);
+ output_ = AddOutput(output);
+ SetBuiltinOp(BuiltinOperator_L2_NORMALIZATION, BuiltinOptions_L2NormOptions,
+ CreateL2NormOptions(builder_, activation_type).Union());
+ BuildInterpreter({GetShape(input_)});
+ }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor<float>(input_, data);
+ }
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(NNAPIDelegate, L2NormSimpleTest) {
+ std::initializer_list<float> data = {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+ L2NormOpModel m({TensorType_FLOAT32, {1, 1, 1, 6}},
+ {TensorType_FLOAT32, {1, 1, 1, 6}},
+ ActivationFunctionType_NONE);
+ m.SetInput(data);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 6}));
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
+}
+
class TransposeSimpleModel : public SingleOpModelWithNNAPI {
public:
TransposeSimpleModel(std::initializer_list<int> input_shape,
@@ -678,6 +713,916 @@ TEST(NNAPIDelegate, TransposeSimpleTest) {
2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
}
+class FloatSubOpModel : public SingleOpModelWithNNAPI {
+ public:
+ FloatSubOpModel(const TensorData& input1, const TensorData& input2,
+ const TensorData& output,
+ ActivationFunctionType activation_type) {
+ input1_ = AddInput(input1);
+ input2_ = AddInput(input2);
+ output_ = AddOutput(output);
+ SetBuiltinOp(BuiltinOperator_SUB, BuiltinOptions_SubOptions,
+ CreateSubOptions(builder_, activation_type).Union());
+ BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+ }
+
+ int input1() { return input1_; }
+ int input2() { return input2_; }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+ int input1_;
+ int input2_;
+ int output_;
+};
+
+TEST(NNAPIDelegate, SubWithNoActivation) {
+ FloatSubOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+ {TensorType_FLOAT32, {1, 2, 2, 1}},
+ {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+ m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+ m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({-2.1, 0.0, 0.4, 0.3})));
+}
+
+class FloatDivOpModel : public SingleOpModelWithNNAPI {
+ public:
+ FloatDivOpModel(const TensorData& input1, const TensorData& input2,
+ const TensorData& output,
+ ActivationFunctionType activation_type) {
+ input1_ = AddInput(input1);
+ input2_ = AddInput(input2);
+ output_ = AddOutput(output);
+ SetBuiltinOp(BuiltinOperator_DIV, BuiltinOptions_DivOptions,
+ CreateDivOptions(builder_, activation_type).Union());
+ BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+ }
+
+ int input1() { return input1_; }
+ int input2() { return input2_; }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+ int input1_;
+ int input2_;
+ int output_;
+};
+
+TEST(NNAPIDelegate, DivWithNoActivation) {
+ FloatDivOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+ {TensorType_FLOAT32, {1, 2, 2, 1}},
+ {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+ m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.8, 0.8});
+ m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.4, 0.2});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({-20, 1, 2, 4})));
+}
+
+class BaseConcatenationOpModel : public SingleOpModelWithNNAPI {
+ public:
+ BaseConcatenationOpModel() {}
+ BaseConcatenationOpModel(const TensorData& input_template, int axis,
+ int num_inputs) {
+ std::vector<std::vector<int>> all_input_shapes;
+ for (int i = 0; i < num_inputs; ++i) {
+ all_input_shapes.push_back(input_template.shape);
+ AddInput(input_template);
+ }
+ output_ = AddOutput({input_template.type, /*shape=*/{}, input_template.min,
+ input_template.max});
+ SetBuiltinOp(
+ BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+ CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+ .Union());
+ BuildInterpreter(all_input_shapes);
+ }
+
+ protected:
+ int output_;
+};
+
+class ConcatenationOpModel : public BaseConcatenationOpModel {
+ public:
+ using BaseConcatenationOpModel::BaseConcatenationOpModel;
+ void SetInput(int index, std::initializer_list<float> data) {
+ PopulateTensor(index, data);
+ }
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+TEST(NNAPIDelegate, ConcatenationThreeDimensionalOneInput) {
+ ConcatenationOpModel m0({TensorType_FLOAT32, {2, 1, 2}}, /*axis=*/1,
+ /*num_inputs=*/1);
+ m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+ m0.Invoke();
+ EXPECT_THAT(m0.GetOutput(), ElementsAreArray({1, 3, 4, 7}));
+}
+
+TEST(NNAPIDelegate, ConcatenationFourInputs) {
+ ConcatenationOpModel m0({TensorType_FLOAT32, {2, 1, 2}}, /*axis=*/2,
+ /*num_inputs=*/4);
+ m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+ m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+ m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+ m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+ m0.Invoke();
+ EXPECT_THAT(m0.GetOutput(),
+ ElementsAreArray({
+ 1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f, //
+ 4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f, //
+ }));
+}
+
+class QuantizedConcatenationOpModel : public BaseConcatenationOpModel {
+ public:
+ using BaseConcatenationOpModel::BaseConcatenationOpModel;
+ QuantizedConcatenationOpModel(const std::vector<TensorData>& input_template,
+ int axis, int num_inputs,
+ const TensorData& output_template) {
+ std::vector<std::vector<int>> all_input_shapes;
+ CHECK_EQ(input_template.size(), num_inputs);
+ for (int i = 0; i < num_inputs; ++i) {
+ all_input_shapes.push_back(input_template[i].shape);
+ AddInput(input_template[i]);
+ }
+ output_ = AddOutput({output_template.type, /*shape=*/{},
+ output_template.min, output_template.max});
+ SetBuiltinOp(
+ BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+ CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+ .Union());
+ BuildInterpreter(all_input_shapes);
+ }
+ void SetInput(int index, std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(index, data);
+ }
+ std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+};
+
+TEST(NNAPIDelegate, ConcatenationFourInputsQuantized) {
+ QuantizedConcatenationOpModel m0({TensorType_UINT8, {2, 1, 2}, -12.7, 12.8},
+ /*axis=*/2,
+ /*num_inputs=*/4);
+
+ m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+ m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+ m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+ m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+ m0.Invoke();
+ EXPECT_THAT(m0.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear({
+ 1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f, //
+ 4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f, //
+ })));
+ EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+ 137, 157, 138, 158, 139, 159, 140, 160, //
+ 167, 197, 168, 198, 169, 199, 170, 200, //
+ }));
+}
+
+TEST(NNAPIDelegate, ConcatenationFourInputsQuantizedMixedRange) {
+ QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+ {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+ {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+ {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+ /*axis=*/2, /*num_inputs=*/4,
+ {TensorType_UINT8, {2, 1, 2}, -12.7, 12.8});
+
+ m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+ m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+ m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+ m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+ m0.Invoke();
+ EXPECT_THAT(m0.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear({
+ 1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f, //
+ 4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f, //
+ })));
+ EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+ 137, 157, 138, 158, 139, 159, 140, 160, //
+ 167, 197, 168, 198, 169, 199, 170, 200, //
+ }));
+}
+
+class DequantizeOpModel : public SingleOpModelWithNNAPI {
+ public:
+ DequantizeOpModel(std::initializer_list<int> shape, float min, float max) {
+ input_ = AddInput({TensorType_UINT8, shape, min, max});
+ output_ = AddOutput({TensorType_FLOAT32, shape});
+ SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
+ CreateDequantizeOptions(builder_).Union());
+
+ BuildInterpreter({GetShape(input_)});
+ }
+
+ void SetInput(std::initializer_list<uint8_t> data) {
+ PopulateTensor(input_, data);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(NNAPIDelegate, DequantizeFourDimensional) {
+ DequantizeOpModel m({2, 5}, -63.5, 64);
+
+ m.SetInput({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear(
+ {-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64})));
+}
+
+class FloorOpModel : public SingleOpModelWithNNAPI {
+ public:
+  FloorOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    // Use the caller-provided element type instead of hard-coding FLOAT32.
+    input_ = AddInput(input_type);
+    output_ = AddOutput(input_type);
+    SetBuiltinOp(BuiltinOperator_FLOOR, BuiltinOptions_NONE, 0);
+    BuildInterpreter({input_shape});
+  }
+
+ int input() { return input_; }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(NNAPIDelegate, FloorSingleDim) {
+ FloorOpModel model({2}, TensorType_FLOAT32);
+ model.PopulateTensor<float>(model.input(), {8.5, 0.0});
+ model.Invoke();
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({8, 0}));
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
+}
+
+TEST(NNAPIDelegate, FloorMultiDims) {
+ FloorOpModel model({2, 1, 1, 5}, TensorType_FLOAT32);
+ model.PopulateTensor<float>(model.input(), {
+ 0.0001,
+ 8.0001,
+ 0.9999,
+ 9.9999,
+ 0.5,
+ -0.0001,
+ -8.0001,
+ -0.9999,
+ -9.9999,
+ -0.5,
+ });
+ model.Invoke();
+ EXPECT_THAT(model.GetOutput(),
+ ElementsAreArray({0, 8, 0, 9, 0, -1, -9, -1, -10, -1}));
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5}));
+}
+
+class LocalResponseNormOpModel : public SingleOpModelWithNNAPI {
+ public:
+ LocalResponseNormOpModel(std::initializer_list<int> input_shape, int radius,
+ float bias, float alpha, float beta) {
+ input_ = AddInput(TensorType_FLOAT32);
+ output_ = AddOutput(TensorType_FLOAT32);
+ SetBuiltinOp(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+ BuiltinOptions_LocalResponseNormalizationOptions,
+ CreateLocalResponseNormalizationOptions(builder_, radius, bias,
+ alpha, beta)
+ .Union());
+ BuildInterpreter({input_shape});
+ }
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+ int input_;
+ int output_;
+};
+
+TEST(NNAPIDelegate, LocalResponseNormSameAsL2Norm) {
+ LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/0.0,
+ /*alpha=*/1.0, /*beta=*/0.5);
+ m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ m.Invoke();
+ // The result is every input divided by 2.
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05})));
+}
+
+TEST(NNAPIDelegate, LocalResponseNormWithAlpha) {
+ LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/0.0,
+ /*alpha=*/4.0, /*beta=*/0.5);
+ m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ m.Invoke();
+  // The result is every input divided by 4:
+  // (bias + alpha * sum_of_squares)^beta = (0 + 4.0 * 4.0)^0.5 = 4.
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+ {-0.275, 0.15, 0.175, 0.3, -0.175, 0.025})));
+}
+
+TEST(NNAPIDelegate, LocalResponseNormWithBias) {
+ LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/9.0,
+ /*alpha=*/4.0, /*beta=*/0.5);
+ m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ m.Invoke();
+ // The result is every input divided by 5.
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear({-0.22, 0.12, 0.14, 0.24, -0.14, 0.02})));
+}
+
+TEST(NNAPIDelegate, LocalResponseNormSmallRadius) {
+ LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/2, /*bias=*/9.0,
+ /*alpha=*/4.0, /*beta=*/0.5);
+ m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ m.Invoke();
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear(
+ {-0.264926, 0.125109, 0.140112, 0.267261, -0.161788, 0.0244266})));
+}
+
+class LSHProjectionOpModel : public SingleOpModelWithNNAPI {
+ public:
+ LSHProjectionOpModel(LSHProjectionType type,
+ std::initializer_list<int> hash_shape,
+ std::initializer_list<int> input_shape,
+ std::initializer_list<int> weight_shape) {
+ hash_ = AddInput(TensorType_FLOAT32);
+ input_ = AddInput(TensorType_INT32);
+ if (weight_shape.size() > 0) {
+ weight_ = AddInput(TensorType_FLOAT32);
+ }
+ output_ = AddOutput(TensorType_INT32);
+
+ SetBuiltinOp(BuiltinOperator_LSH_PROJECTION,
+ BuiltinOptions_LSHProjectionOptions,
+ CreateLSHProjectionOptions(builder_, type).Union());
+ if (weight_shape.size() > 0) {
+ BuildInterpreter({hash_shape, input_shape, weight_shape});
+ } else {
+ BuildInterpreter({hash_shape, input_shape});
+ }
+
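+    // DENSE produces one output element per hash-table cell (the product of
+    // all hash_shape dims); SPARSE produces one per hash function (the first
+    // dim only), hence the early break.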
+ output_size_ = 1;
+ for (int i : hash_shape) {
+ output_size_ *= i;
+ if (type == LSHProjectionType_SPARSE) {
+ break;
+ }
+ }
+ }
+ void SetInput(std::initializer_list<int> data) {
+ PopulateTensor(input_, data);
+ }
+
+ void SetHash(std::initializer_list<float> data) {
+ PopulateTensor(hash_, data);
+ }
+
+ void SetWeight(std::initializer_list<float> f) { PopulateTensor(weight_, f); }
+
+ std::vector<int> GetOutput() { return ExtractVector<int>(output_); }
+
+ private:
+ int input_;
+ int hash_;
+ int weight_;
+ int output_;
+
+ int output_size_;
+};
+
+TEST(NNAPIDelegate, LSHProjectionDense1DInputs) {
+ LSHProjectionOpModel m(LSHProjectionType_DENSE, {3, 2}, {5}, {5});
+
+ m.SetInput({12345, 54321, 67890, 9876, -12345678});
+ m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+ m.SetWeight({1.0, 1.0, 1.0, 1.0, 1.0});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAre(0, 0, 0, 1, 0, 0));
+}
+
+TEST(NNAPIDelegate, LSHProjectionSparse1DInputs) {
+ LSHProjectionOpModel m(LSHProjectionType_SPARSE, {3, 2}, {5}, {});
+
+ m.SetInput({12345, 54321, 67890, 9876, -12345678});
+ m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+
+ m.Invoke();
+
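+  // Sparse outputs are offset-encoded: entry i stores i * 4 + bucket, since
+  // each of the 3 hash functions yields a 2-bit bucket value.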
+ EXPECT_THAT(m.GetOutput(), ElementsAre(0 + 0, 4 + 1, 8 + 0));
+}
+
+TEST(NNAPIDelegate, LSHProjectionSparse3DInputs) {
+ LSHProjectionOpModel m(LSHProjectionType_SPARSE, {3, 2}, {5, 2, 2}, {5});
+
+ m.SetInput({1234, 2345, 3456, 1234, 4567, 5678, 6789, 4567, 7891, 8912,
+ 9123, 7890, -987, -876, -765, -987, -543, -432, -321, -543});
+ m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+ m.SetWeight({0.12, 0.34, 0.56, 0.67, 0.78});
+
+ m.Invoke();
+
+ EXPECT_THAT(m.GetOutput(), ElementsAre(0 + 2, 4 + 1, 8 + 1));
+}
+
+class BaseActivationsOpModel : public SingleOpModelWithNNAPI {
+ public:
+ // Most activations don't take any options, so this constructor works for
+ // them.
+ BaseActivationsOpModel(BuiltinOperator type, TensorData input) {
+ input_ = AddInput(input);
+ if (input.type == TensorType_UINT8) {
+ output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+ } else {
+ output_ = AddOutput({input.type, {}});
+ }
+ SetBuiltinOp(type, BuiltinOptions_NONE, 0);
+ BuildInterpreter({GetShape(input_)});
+ }
+
+ BaseActivationsOpModel(BuiltinOperator type, const TensorData& input,
+ const TensorData& output) {
+ input_ = AddInput(input);
+ output_ = AddOutput(output);
+ SetBuiltinOp(type, BuiltinOptions_NONE, 0);
+ BuildInterpreter({GetShape(input_)});
+ }
+
+ protected:
+ int input_;
+ int output_;
+};
+
+class FloatActivationsOpModel : public BaseActivationsOpModel {
+ public:
+ using BaseActivationsOpModel::BaseActivationsOpModel;
+
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor(input_, data);
+ }
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
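+// Allow up to two quantization steps of error: quantized activation outputs
+// use a fixed scale of 1/256 (see BaseActivationsOpModel).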
+const float kQuantizedTolerance = 2 * (1. / 256);
+
+class QuantizedActivationsOpModel : public BaseActivationsOpModel {
+ public:
+ using BaseActivationsOpModel::BaseActivationsOpModel;
+
+ template <typename T>
+ void SetInput(std::initializer_list<float> data) {
+ QuantizeAndPopulate<T>(input_, data);
+ }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+ template <typename T>
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+ GetZeroPoint(output_));
+ }
+};
+
+TEST(NNAPIDelegate, Relu) {
+ FloatActivationsOpModel m(BuiltinOperator_RELU,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 0, 0, 2, 4, //
+ 3, 0, 10, 1, //
+ }));
+}
+
+TEST(NNAPIDelegate, Relu1) {
+ FloatActivationsOpModel m(BuiltinOperator_RELU_N1_TO_1,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
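+  // RELU_N1_TO_1 clamps each value to the range [-1, 1].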
+ m.SetInput({
+ 0.0, -0.6, 0.2, -0.4, //
+ 0.3, -2.0, 1.1, -0.1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 0.0, -0.6, 0.2, -0.4, //
+ 0.3, -1.0, 1.0, -0.1, //
+ }));
+}
+
+TEST(NNAPIDelegate, Relu6) {
+ FloatActivationsOpModel m(BuiltinOperator_RELU6,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 0, 0, 2, 4, //
+ 3, 0, 6, 1, //
+ }));
+}
+
+TEST(NNAPIDelegate, Tanh) {
+ FloatActivationsOpModel m(BuiltinOperator_TANH,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 0, -0.9999877, 0.9640275, 0.999329, //
+ 0.99505475, -0.9640275, 1, 0.7615941, //
+ })));
+}
+
+TEST(NNAPIDelegate, LogisticFloat) {
+ FloatActivationsOpModel m(BuiltinOperator_LOGISTIC,
+ /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+ m.SetInput({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+ 0.5, 0.002473, 0.880797, 0.982014, //
+ 0.952574, 0.119203, 0.999955, 0.731059, //
+ })));
+}
+
+TEST(NNAPIDelegate, LogisticQuantized) {
+ QuantizedActivationsOpModel m(
+ BuiltinOperator_LOGISTIC,
+ /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -10, 10});
+ m.SetInput<uint8_t>({
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
+ });
+ m.Invoke();
+ EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+ ElementsAreArray(ArrayFloatNear(
+ {
+ 0.5, 0.002473, 0.880797, 0.982014, //
+ 0.952574, 0.119203, 0.999955, 0.731059, //
+ },
+ kQuantizedTolerance)));
+ EXPECT_THAT(m.GetOutput<uint8_t>(),
+ ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
+}
+
+#if 0
+class ResizeBilinearOpModel : public SingleOpModelWithNNAPI {
+ public:
+ ResizeBilinearOpModel(const TensorData& input,
+ std::initializer_list<int> size_data = {}) {
+ bool const_size = size_data.size() != 0;
+ input_ = AddInput(input);
+ if (const_size) {
+ size_ = AddConstInput(TensorType_INT32, size_data, {2});
+ } else {
+ size_ = AddInput({TensorType_INT32, {2}});
+ }
+ output_ = AddOutput(input.type);
+ SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR,
+ BuiltinOptions_ResizeBilinearOptions,
+ CreateResizeBilinearOptions(builder_).Union());
+ if (const_size) {
+ BuildInterpreter({GetShape(input_)});
+ } else {
+ BuildInterpreter({GetShape(input_), GetShape(size_)});
+ }
+ }
+
+ template <typename T>
+ void SetInput(std::initializer_list<T> data) {
+ PopulateTensor(input_, data);
+ }
+ void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); }
+
+ template <typename T>
+ std::vector<T> GetOutput() {
+ return ExtractVector<T>(output_);
+ }
+
+ private:
+ int input_;
+ int size_;
+ int output_;
+};
+
+TEST(NNAPIDelegate, ResizeBilinearHorizontal) {
+ ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
+ m.SetInput<float>({3, 6});
+ m.SetSize({1, 3});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput<float>(),
+ ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+ ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3});
+ const_m.SetInput<float>({3, 6});
+ const_m.Invoke();
+ EXPECT_THAT(const_m.GetOutput<float>(),
+ ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
+
+TEST(NNAPIDelegate, ResizeBilinearVertical) {
+ ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
+ m.SetInput<float>({3, 9});
+ m.SetSize({3, 1});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput<float>(),
+ ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+ ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1});
+ const_m.SetInput<float>({3, 9});
+ const_m.Invoke();
+ EXPECT_THAT(const_m.GetOutput<float>(),
+ ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
+TEST(NNAPIDelegate, ResizeBilinearTwoDimensional) {
+ ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
+ m.SetInput<float>({
+ 3, 6, //
+ 9, 12 //
+ });
+ m.SetSize({3, 3});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+ 3, 5, 6, //
+ 7, 9, 10, //
+ 9, 11, 12, //
+ })));
+
+ ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3});
+ const_m.SetInput<float>({
+ 3, 6, //
+ 9, 12 //
+ });
+ const_m.Invoke();
+ EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+ 3, 5, 6, //
+ 7, 9, 10, //
+ 9, 11, 12, //
+ })));
+}
+#endif
+
+template <typename T>
+class PadOpModel : public SingleOpModelWithNNAPI {
+ public:
+ void SetInput(std::initializer_list<T> data) {
+ PopulateTensor<T>(input_, data);
+ }
+
+ void SetQuantizedInput(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(input_, data);
+ }
+
+ void SetQuantizedPadValue(float data) {
+ QuantizeAndPopulate<uint8_t>(constant_values_, {data});
+ }
+
+ void SetPaddings(std::initializer_list<int> paddings) {
+ PopulateTensor<int>(paddings_, paddings);
+ }
+
+ std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+
+ protected:
+ int input_;
+ int output_;
+ int paddings_;
+ int constant_values_;
+};
+
+class PadOpConstModel : public PadOpModel<float> {
+ public:
+ PadOpConstModel(const TensorData& input,
+ std::initializer_list<int> paddings_shape,
+ std::initializer_list<int> paddings,
+ const TensorData& output) {
+ input_ = AddInput(input);
+ paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
+ output_ = AddOutput(output);
+
+ SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
+ CreatePadOptions(builder_).Union());
+ BuildInterpreter({input.shape});
+ }
+};
+
+TEST(NNAPIDelegate, PadAdvancedConstTest) {
+ PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+ {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_FLOAT32});
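+  // Paddings are (before, after) pairs per dimension: height grows by 0 + 2
+  // and width by 1 + 3, giving an output shape of {1, 4, 7, 1}.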
+ m.SetInput({1, 2, 3, 4, 5, 6});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(),
+ ElementsAreArray({0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+class SpaceToBatchNDOpModel : public SingleOpModelWithNNAPI {
+ public:
+ void SetInput(std::initializer_list<float> data) {
+ PopulateTensor<float>(input_, data);
+ }
+
+ void SetBlockShape(std::initializer_list<int> data) {
+ PopulateTensor<int>(block_shape_, data);
+ }
+
+ void SetPaddings(std::initializer_list<int> data) {
+ PopulateTensor<int>(paddings_, data);
+ }
+
+ std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+ int input_;
+ int block_shape_;
+ int paddings_;
+ int output_;
+};
+
+class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel {
+ public:
+ SpaceToBatchNDOpConstModel(std::initializer_list<int> input_shape,
+ std::initializer_list<int> block_shape,
+ std::initializer_list<int> paddings) {
+ input_ = AddInput(TensorType_FLOAT32);
+ block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
+ paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2});
+ output_ = AddOutput(TensorType_FLOAT32);
+
+ SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
+ BuiltinOptions_SpaceToBatchNDOptions,
+ CreateSpaceToBatchNDOptions(builder_).Union());
+ BuildInterpreter({input_shape});
+ }
+};
+
+TEST(NNAPIDelegate, SpaceToBatchNDSimpleConstTest) {
+ SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0});
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+ m.Invoke();
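+  // The batch dimension grows by the block size product: 1 * (2 * 2) = 4.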
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+ 13, 15, 6, 8, 14, 16}));
+}
+
+TEST(NNAPIDelegate, SpaceToBatchNDMultipleInputBatchesConstTest) {
+ SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0});
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+ 13, 15, 6, 8, 14, 16}));
+}
+
+TEST(NNAPIDelegate, SpaceToBatchNDSimplePaddingConstTest) {
+ SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0});
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 0, 0, 0, 5, 0, 0, 0, 6, 0, 1, 0, 7,
+ 0, 2, 0, 8, 0, 3, 0, 9, 0, 4, 0, 10,
+ }));
+}
+
+TEST(NNAPIDelegate, SpaceToBatchNDComplexPaddingConstTest) {
+ SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4});
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+ 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
+ 0, 1, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 8, 0, 0,
+ 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
+ }));
+}
+
+template <typename input_type = float,
+ TensorType tensor_input_type = TensorType_FLOAT32>
+class StridedSliceOpModel : public SingleOpModelWithNNAPI {
+ public:
+ StridedSliceOpModel(std::initializer_list<int> input_shape,
+ std::initializer_list<int> begin_shape,
+ std::initializer_list<int> end_shape,
+ std::initializer_list<int> strides_shape, int begin_mask,
+ int end_mask, int ellipsis_mask, int new_axis_mask,
+ int shrink_axis_mask) {
+ input_ = AddInput(tensor_input_type);
+ begin_ = AddInput(TensorType_INT32);
+ end_ = AddInput(TensorType_INT32);
+ strides_ = AddInput(TensorType_INT32);
+ output_ = AddOutput(tensor_input_type);
+ SetBuiltinOp(
+ BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
+ CreateStridedSliceOptions(builder_, begin_mask, end_mask, ellipsis_mask,
+ new_axis_mask, shrink_axis_mask)
+ .Union());
+ BuildInterpreter({input_shape, begin_shape, end_shape, strides_shape});
+ }
+
+ void SetInput(std::initializer_list<input_type> data) {
+ PopulateTensor<input_type>(input_, data);
+ }
+ void SetBegin(std::initializer_list<int32_t> data) {
+ PopulateTensor<int32_t>(begin_, data);
+ }
+ void SetEnd(std::initializer_list<int32_t> data) {
+ PopulateTensor<int32_t>(end_, data);
+ }
+ void SetStrides(std::initializer_list<int32_t> data) {
+ PopulateTensor<int32_t>(strides_, data);
+ }
+
+ std::vector<input_type> GetOutput() {
+ return ExtractVector<input_type>(output_);
+ }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+ int input_;
+ int begin_;
+ int end_;
+ int strides_;
+ int output_;
+};
+
+TEST(NNAPIDelegate, StridedSliceIn2D) {
+ StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+ m.SetInput({1, 2, 3, 4, 5, 6});
+ m.SetBegin({1, 0});
+ m.SetEnd({2, 2});
+ m.SetStrides({1, 1});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5}));
+}
+
+TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxis_NegativeSlice) {
+ // This is equivalent to tf.range(4)[:, tf.newaxis][-2, -1].
+ StridedSliceOpModel<> m({4, 1}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+ m.SetInput({0, 1, 2, 3});
+ m.SetBegin({-2, -1});
+ m.SetEnd({-1, 0});
+ m.SetStrides({1, 1});
+
+ m.Invoke();
+ EXPECT_TRUE(m.GetOutputShape().empty());
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+}
+
+TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxisMask) {
+ StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
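+  // shrink_axis_mask == 3 drops both sliced dimensions, yielding a scalar.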
+ m.SetInput({1, 2, 3, 4, 5, 6});
+ m.SetBegin({0, 0});
+ m.SetEnd({1, 1});
+ m.SetStrides({1, 1});
+ m.Invoke();
+ EXPECT_TRUE(m.GetOutputShape().empty());
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+
} // namespace
} // namespace tflite
diff --git a/tensorflow/contrib/lite/examples/android/app/README.md b/tensorflow/contrib/lite/examples/android/app/README.md
index 8e12bd04dd..cbdeeac879 100644
--- a/tensorflow/contrib/lite/examples/android/app/README.md
+++ b/tensorflow/contrib/lite/examples/android/app/README.md
@@ -2,9 +2,9 @@
## Building from Source with Bazel
-1. Follow the [Bazel steps for the TF Demo App](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel).
+1. Install [Bazel](https://docs.bazel.build/versions/master/install.html), the Android NDK and SDK. The recommended versions are specified on this [webpage](https://www.tensorflow.org/mobile/tflite/demo_android#build_tensorflow_lite_and_the_demo_app_from_source).
-2. Build the app with Bazel. The demo needs C++11. We configure the fat_apk_cpu flag to package support for 4 hardware variants. You may replace it with --config=android_arm64 on a 64-bit device and --config=android_arm for 32-bit device:
+2. Build this demo app with Bazel. The demo needs C++11. We configure the --fat_apk_cpu flag to package support for 4 hardware variants. You may replace it with --config=android_arm64 for a 64-bit device or --config=android_arm for a 32-bit device:
```shell
bazel build -c opt --cxxopt='--std=c++11' --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \
diff --git a/tensorflow/contrib/lite/experimental/c/BUILD b/tensorflow/contrib/lite/experimental/c/BUILD
new file mode 100644
index 0000000000..50f8da66d0
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/BUILD
@@ -0,0 +1,59 @@
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"]) # Apache 2.0
+
+load(
+ "//tensorflow/contrib/lite:build_def.bzl",
+ "tflite_cc_shared_object",
+ "tflite_copts",
+ "tflite_jni_binary",
+)
+
+tflite_cc_shared_object(
+ name = "libtensorflowlite_c.so",
+ linkopts = select({
+ "//tensorflow:darwin": [
+ "-Wl,-exported_symbols_list", # This line must be directly followed by the exported_symbols.lds file
+ "$(location //tensorflow/contrib/lite/experimental/c:exported_symbols.lds)",
+ "-Wl,-install_name,@rpath/libtensorflowlite_c.so",
+ ],
+ "//tensorflow:windows": [],
+ "//conditions:default": [
+ "-z defs",
+ "-Wl,--version-script", # This line must be directly followed by the version_script.lds file
+ "$(location //tensorflow/contrib/lite/experimental/c:version_script.lds)",
+ ],
+ }),
+ deps = [
+ ":c_api",
+ ":exported_symbols.lds",
+ ":version_script.lds",
+ ],
+)
+
+cc_library(
+ name = "c_api",
+ srcs = ["c_api.cc"],
+ hdrs = ["c_api.h"],
+ copts = tflite_copts(),
+ deps = [
+ "//tensorflow/contrib/lite:context",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite:schema_fbs_version",
+ "//tensorflow/contrib/lite/kernels:builtin_ops",
+ ],
+)
+
+cc_test(
+ name = "c_api_test",
+ size = "small",
+ srcs = ["c_api_test.cc"],
+ data = ["//tensorflow/contrib/lite:testdata/add.bin"],
+ deps = [
+ ":c_api",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite:kernel_api",
+ "//tensorflow/contrib/lite/testing:util",
+ "@com_google_googletest//:gtest",
+ ],
+)
diff --git a/tensorflow/contrib/lite/experimental/c/c_api.cc b/tensorflow/contrib/lite/experimental/c/c_api.cc
new file mode 100644
index 0000000000..9d29e8b3e0
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/c_api.cc
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/experimental/c/c_api.h"
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+struct TFL_Interpreter {
+ std::unique_ptr<tflite::Interpreter> impl;
+};
+
+// LINT.IfChange
+
+TFL_Interpreter* TFL_NewInterpreter(const void* model_data,
+ int32_t model_size) {
+ auto model = tflite::FlatBufferModel::BuildFromBuffer(
+ static_cast<const char*>(model_data), static_cast<size_t>(model_size));
+ if (!model) {
+ return nullptr;
+ }
+
+ tflite::ops::builtin::BuiltinOpResolver resolver;
+ tflite::InterpreterBuilder builder(*model, resolver);
+ std::unique_ptr<tflite::Interpreter> interpreter_impl;
+ if (builder(&interpreter_impl) != kTfLiteOk) {
+ return nullptr;
+ }
+
+ return new TFL_Interpreter{std::move(interpreter_impl)};
+}
+
+void TFL_DeleteInterpreter(TFL_Interpreter* interpreter) { delete interpreter; }
+
+int32_t TFL_InterpreterGetInputTensorCount(const TFL_Interpreter* interpreter) {
+ return static_cast<int>(interpreter->impl->inputs().size());
+}
+
+TFL_Tensor* TFL_InterpreterGetInputTensor(const TFL_Interpreter* interpreter,
+ int32_t input_index) {
+ return interpreter->impl->tensor(interpreter->impl->inputs()[input_index]);
+}
+
+TFL_Status TFL_InterpreterResizeInputTensor(TFL_Interpreter* interpreter,
+ int32_t input_index,
+ const int* input_dims,
+ int32_t input_dims_size) {
+ std::vector<int> dims{input_dims, input_dims + input_dims_size};
+ return interpreter->impl->ResizeInputTensor(
+ interpreter->impl->inputs()[input_index], dims);
+}
+
+TFL_Status TFL_InterpreterAllocateTensors(TFL_Interpreter* interpreter) {
+ return interpreter->impl->AllocateTensors();
+}
+
+TFL_Status TFL_InterpreterInvoke(TFL_Interpreter* interpreter) {
+ return interpreter->impl->Invoke();
+}
+
+int32_t TFL_InterpreterGetOutputTensorCount(
+ const TFL_Interpreter* interpreter) {
+ return static_cast<int>(interpreter->impl->outputs().size());
+}
+
+const TFL_Tensor* TFL_InterpreterGetOutputTensor(
+ const TFL_Interpreter* interpreter, int32_t output_index) {
+ return interpreter->impl->tensor(interpreter->impl->outputs()[output_index]);
+}
+
+TFL_Type TFL_TensorType(const TFL_Tensor* tensor) { return tensor->type; }
+
+int32_t TFL_TensorNumDims(const TFL_Tensor* tensor) {
+ return tensor->dims->size;
+}
+
+int32_t TFL_TensorDim(const TFL_Tensor* tensor, int32_t dim_index) {
+ return tensor->dims->data[dim_index];
+}
+
+size_t TFL_TensorByteSize(const TFL_Tensor* tensor) { return tensor->bytes; }
+
+TFL_Status TFL_TensorCopyFromBuffer(TFL_Tensor* tensor, const void* input_data,
+ int32_t input_data_size) {
+ if (tensor->bytes != static_cast<size_t>(input_data_size)) {
+ return kTfLiteError;
+ }
+ memcpy(tensor->data.raw, input_data, input_data_size);
+ return kTfLiteOk;
+}
+
+TFL_Status TFL_TensorCopyToBuffer(const TFL_Tensor* tensor, void* output_data,
+ int32_t output_data_size) {
+ if (tensor->bytes != static_cast<size_t>(output_data_size)) {
+ return kTfLiteError;
+ }
+ memcpy(output_data, tensor->data.raw, output_data_size);
+ return kTfLiteOk;
+}
+
+// LINT.ThenChange(//tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
diff --git a/tensorflow/contrib/lite/experimental/c/c_api.h b/tensorflow/contrib/lite/experimental/c/c_api.h
new file mode 100644
index 0000000000..070f1add13
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/c_api.h
@@ -0,0 +1,149 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_H_
+
+#include <stdint.h>
+
+// Eventually the various C APIs defined in context.h will be migrated into
+// the appropriate /c/c_api*.h header. For now, we pull in existing definitions
+// for convenience.
+#include "tensorflow/contrib/lite/context.h"
+
+// --------------------------------------------------------------------------
+// Experimental C API for TensorFlowLite.
+//
+// The API leans towards simplicity and uniformity instead of convenience, as
+// most usage will be by language-specific wrappers.
+//
+// Conventions:
+// * We use the prefix TFL_ for everything in the API.
+
+#ifdef SWIG
+#define TFL_CAPI_EXPORT
+#else
+#if defined(_WIN32)
+#ifdef TF_COMPILE_LIBRARY
+#define TFL_CAPI_EXPORT __declspec(dllexport)
+#else
+#define TFL_CAPI_EXPORT __declspec(dllimport)
+#endif // TF_COMPILE_LIBRARY
+#else
+#define TFL_CAPI_EXPORT __attribute__((visibility("default")))
+#endif // _WIN32
+#endif // SWIG
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+typedef TfLiteTensor TFL_Tensor;
+typedef TfLiteStatus TFL_Status;
+typedef TfLiteType TFL_Type;
+
+// --------------------------------------------------------------------------
+// TFL_Interpreter provides inference from a provided model.
+typedef struct TFL_Interpreter TFL_Interpreter;
+
+// Returns an interpreter for the provided model, or null on failure.
+//
+// NOTE: The client *must* explicitly allocate tensors before attempting to
+// access input tensor data or invoke the interpreter.
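+//
+// The caller retains ownership of model_data, which must remain valid for the
+// lifetime of the returned interpreter.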
+TFL_CAPI_EXPORT extern TFL_Interpreter* TFL_NewInterpreter(
+ const void* model_data, int32_t model_size);
+
+// Destroys the interpreter.
+TFL_CAPI_EXPORT extern void TFL_DeleteInterpreter(TFL_Interpreter* interpreter);
+
+// Returns the number of input tensors associated with the model.
+TFL_CAPI_EXPORT extern int32_t TFL_InterpreterGetInputTensorCount(
+ const TFL_Interpreter* interpreter);
+
+// Returns the tensor associated with the input index.
+// REQUIRES: 0 <= input_index < TFL_InterpreterGetInputTensorCount(interpreter)
+TFL_CAPI_EXPORT extern TFL_Tensor* TFL_InterpreterGetInputTensor(
+ const TFL_Interpreter* interpreter, int32_t input_index);
+
+// Attempts to resize the specified input tensor.
+// NOTE: After a resize, the client *must* explicitly allocate tensors before
+// attempting to access the resized tensor data or invoke the interpreter.
+// REQUIRES: 0 <= input_index < TFL_InterpreterGetInputTensorCount(interpreter)
+TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterResizeInputTensor(
+ TFL_Interpreter* interpreter, int32_t input_index, const int* input_dims,
+ int32_t input_dims_size);
+
+// Updates allocations for all tensors, resizing dependent tensors using the
+// specified input tensor dimensionality.
+//
+// This is a relatively expensive operation, and need only be called after
+// creating the graph and/or resizing any inputs.
+TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterAllocateTensors(
+ TFL_Interpreter* interpreter);
+
+// Runs inference for the loaded graph.
+//
+// NOTE: It is possible that the interpreter is not in a ready state to
+// evaluate (e.g., if a ResizeInputTensor() has been performed without a call to
+// AllocateTensors()).
+TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterInvoke(
+ TFL_Interpreter* interpreter);
+
+// Returns the number of output tensors associated with the model.
+TFL_CAPI_EXPORT extern int32_t TFL_InterpreterGetOutputTensorCount(
+ const TFL_Interpreter* interpreter);
+
+// Returns the tensor associated with the output index.
+// REQUIRES: 0 <= output_index < TFL_InterpreterGetOutputTensorCount(interpreter)
+TFL_CAPI_EXPORT extern const TFL_Tensor* TFL_InterpreterGetOutputTensor(
+ const TFL_Interpreter* interpreter, int32_t output_index);
+
+// --------------------------------------------------------------------------
+// TFL_Tensor wraps data associated with a graph tensor.
+//
+// Note that while the TFL_Tensor struct is not currently opaque and its
+// fields can be accessed directly, these accessors are still convenient for
+// language bindings. In the future the tensor struct will likely be made
+// opaque in the public API.
+
+// Returns the type of a tensor element.
+TFL_CAPI_EXPORT extern TFL_Type TFL_TensorType(const TFL_Tensor* tensor);
+
+// Returns the number of dimensions that the tensor has.
+TFL_CAPI_EXPORT extern int32_t TFL_TensorNumDims(const TFL_Tensor* tensor);
+
+// Returns the length of the tensor in the "dim_index" dimension.
+// REQUIRES: 0 <= dim_index < TFL_TensorNumDims(tensor)
+TFL_CAPI_EXPORT extern int32_t TFL_TensorDim(const TFL_Tensor* tensor,
+ int32_t dim_index);
+
+// Returns the size of the underlying data in bytes.
+TFL_CAPI_EXPORT extern size_t TFL_TensorByteSize(const TFL_Tensor* tensor);
+
+// Copies from the provided input buffer into the tensor's buffer.
+// REQUIRES: input_data_size == TFL_TensorByteSize(tensor)
+TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyFromBuffer(
+ TFL_Tensor* tensor, const void* input_data, int32_t input_data_size);
+
+// Copies to the provided output buffer from the tensor's buffer.
+// REQUIRES: output_data_size == TFL_TensorByteSize(tensor)
+TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyToBuffer(
+    const TFL_Tensor* tensor, void* output_data, int32_t output_data_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_H_
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_test.cc b/tensorflow/contrib/lite/experimental/c/c_api_test.cc
new file mode 100644
index 0000000000..bc925e00a6
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/c_api_test.cc
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/contrib/lite/experimental/c/c_api.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace {
+
+TEST(CApiSimple, Smoke) {
+ tflite::FileCopyAllocation model_file(
+ "tensorflow/contrib/lite/testdata/add.bin",
+ tflite::DefaultErrorReporter());
+
+ TFL_Interpreter* interpreter =
+ TFL_NewInterpreter(model_file.base(), model_file.bytes());
+ ASSERT_NE(interpreter, nullptr);
+ ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
+
+ ASSERT_EQ(TFL_InterpreterGetInputTensorCount(interpreter), 1);
+ ASSERT_EQ(TFL_InterpreterGetOutputTensorCount(interpreter), 1);
+
+ std::array<int, 1> input_dims = {2};
+ ASSERT_EQ(TFL_InterpreterResizeInputTensor(interpreter, 0, input_dims.data(),
+ input_dims.size()),
+ kTfLiteOk);
+ ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
+
+ TFL_Tensor* input_tensor = TFL_InterpreterGetInputTensor(interpreter, 0);
+ ASSERT_NE(input_tensor, nullptr);
+ EXPECT_EQ(TFL_TensorType(input_tensor), kTfLiteFloat32);
+ EXPECT_EQ(TFL_TensorNumDims(input_tensor), 1);
+ EXPECT_EQ(TFL_TensorDim(input_tensor, 0), 2);
+ EXPECT_EQ(TFL_TensorByteSize(input_tensor), sizeof(float) * 2);
+
+ std::array<float, 2> input = {1.f, 3.f};
+ ASSERT_EQ(TFL_TensorCopyFromBuffer(input_tensor, input.data(),
+ input.size() * sizeof(float)),
+ kTfLiteOk);
+
+ ASSERT_EQ(TFL_InterpreterInvoke(interpreter), kTfLiteOk);
+
+ const TFL_Tensor* output_tensor =
+ TFL_InterpreterGetOutputTensor(interpreter, 0);
+ ASSERT_NE(output_tensor, nullptr);
+ EXPECT_EQ(TFL_TensorType(output_tensor), kTfLiteFloat32);
+ EXPECT_EQ(TFL_TensorNumDims(output_tensor), 1);
+ EXPECT_EQ(TFL_TensorDim(output_tensor, 0), 2);
+ EXPECT_EQ(TFL_TensorByteSize(output_tensor), sizeof(float) * 2);
+
+ std::array<float, 2> output;
+ ASSERT_EQ(TFL_TensorCopyToBuffer(output_tensor, output.data(),
+ output.size() * sizeof(float)),
+ kTfLiteOk);
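+  // The add.bin test model adds its input to itself twice, so each output
+  // element should be three times the corresponding input (1 -> 3, 3 -> 9).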
+ EXPECT_EQ(output[0], 3.f);
+ EXPECT_EQ(output[1], 9.f);
+
+ TFL_DeleteInterpreter(interpreter);
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+ ::tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/experimental/c/exported_symbols.lds b/tensorflow/contrib/lite/experimental/c/exported_symbols.lds
new file mode 100644
index 0000000000..a3ddc6bc8d
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/exported_symbols.lds
@@ -0,0 +1 @@
+_TFL_*
diff --git a/tensorflow/contrib/lite/experimental/c/version_script.lds b/tensorflow/contrib/lite/experimental/c/version_script.lds
new file mode 100644
index 0000000000..c0c8a2bca1
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/version_script.lds
@@ -0,0 +1,9 @@
+VERS_1.0 {
+ # Export symbols in c_api.h.
+ global:
+ *TFL_*;
+
+ # Hide everything else.
+ local:
+ *;
+};
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore
new file mode 100644
index 0000000000..c72a5cae9e
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore
@@ -0,0 +1,13 @@
+# Unity generated
+Builds/
+Temp/
+Library/
+obj/
+# Visual Studio / MonoDevelop generated
+*.csproj
+*.unityproj
+*.sln
+*.suo
+*.userprefs
+# OS generated
+.DS_Store
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta
new file mode 100644
index 0000000000..ed9337b53e
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 71d1b4219b1da4aeaa1cebbec324fc81
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta
new file mode 100644
index 0000000000..edcce00939
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: d948aead14abd4c88947c9886d16f774
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta
new file mode 100644
index 0000000000..36b35516f0
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: b810b85b794fa48fd93100acf5525e1f
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta
new file mode 100644
index 0000000000..d4133da49a
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 154f4201e2e454d4696fa5834eaa3ad3
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity
new file mode 100644
index 0000000000..9397d8f27a
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity
@@ -0,0 +1,242 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!29 &1
+OcclusionCullingSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 2
+ m_OcclusionBakeSettings:
+ smallestOccluder: 5
+ smallestHole: 0.25
+ backfaceThreshold: 100
+ m_SceneGUID: 00000000000000000000000000000000
+ m_OcclusionCullingData: {fileID: 0}
+--- !u!104 &2
+RenderSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 8
+ m_Fog: 0
+ m_FogColor: {r: 0.5, g: 0.5, b: 0.5, a: 1}
+ m_FogMode: 3
+ m_FogDensity: 0.01
+ m_LinearFogStart: 0
+ m_LinearFogEnd: 300
+ m_AmbientSkyColor: {r: 0.212, g: 0.227, b: 0.259, a: 1}
+ m_AmbientEquatorColor: {r: 0.114, g: 0.125, b: 0.133, a: 1}
+ m_AmbientGroundColor: {r: 0.047, g: 0.043, b: 0.035, a: 1}
+ m_AmbientIntensity: 1
+ m_AmbientMode: 3
+ m_SubtractiveShadowColor: {r: 0.42, g: 0.478, b: 0.627, a: 1}
+ m_SkyboxMaterial: {fileID: 0}
+ m_HaloStrength: 0.5
+ m_FlareStrength: 1
+ m_FlareFadeSpeed: 3
+ m_HaloTexture: {fileID: 0}
+ m_SpotCookie: {fileID: 10001, guid: 0000000000000000e000000000000000, type: 0}
+ m_DefaultReflectionMode: 0
+ m_DefaultReflectionResolution: 128
+ m_ReflectionBounces: 1
+ m_ReflectionIntensity: 1
+ m_CustomReflection: {fileID: 0}
+ m_Sun: {fileID: 0}
+ m_IndirectSpecularColor: {r: 0, g: 0, b: 0, a: 1}
+--- !u!157 &3
+LightmapSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 11
+ m_GIWorkflowMode: 1
+ m_GISettings:
+ serializedVersion: 2
+ m_BounceScale: 1
+ m_IndirectOutputScale: 1
+ m_AlbedoBoost: 1
+ m_TemporalCoherenceThreshold: 1
+ m_EnvironmentLightingMode: 0
+ m_EnableBakedLightmaps: 0
+ m_EnableRealtimeLightmaps: 0
+ m_LightmapEditorSettings:
+ serializedVersion: 9
+ m_Resolution: 2
+ m_BakeResolution: 40
+ m_TextureWidth: 1024
+ m_TextureHeight: 1024
+ m_AO: 0
+ m_AOMaxDistance: 1
+ m_CompAOExponent: 1
+ m_CompAOExponentDirect: 0
+ m_Padding: 2
+ m_LightmapParameters: {fileID: 0}
+ m_LightmapsBakeMode: 1
+ m_TextureCompression: 1
+ m_FinalGather: 0
+ m_FinalGatherFiltering: 1
+ m_FinalGatherRayCount: 256
+ m_ReflectionCompression: 2
+ m_MixedBakeMode: 2
+ m_BakeBackend: 0
+ m_PVRSampling: 1
+ m_PVRDirectSampleCount: 32
+ m_PVRSampleCount: 500
+ m_PVRBounces: 2
+ m_PVRFilterTypeDirect: 0
+ m_PVRFilterTypeIndirect: 0
+ m_PVRFilterTypeAO: 0
+ m_PVRFilteringMode: 1
+ m_PVRCulling: 1
+ m_PVRFilteringGaussRadiusDirect: 1
+ m_PVRFilteringGaussRadiusIndirect: 5
+ m_PVRFilteringGaussRadiusAO: 2
+ m_PVRFilteringAtrousPositionSigmaDirect: 0.5
+ m_PVRFilteringAtrousPositionSigmaIndirect: 2
+ m_PVRFilteringAtrousPositionSigmaAO: 1
+ m_ShowResolutionOverlay: 1
+ m_LightingDataAsset: {fileID: 0}
+ m_UseShadowmask: 1
+--- !u!196 &4
+NavMeshSettings:
+ serializedVersion: 2
+ m_ObjectHideFlags: 0
+ m_BuildSettings:
+ serializedVersion: 2
+ agentTypeID: 0
+ agentRadius: 0.5
+ agentHeight: 2
+ agentSlope: 45
+ agentClimb: 0.4
+ ledgeDropHeight: 0
+ maxJumpAcrossDistance: 0
+ minRegionArea: 2
+ manualCellSize: 0
+ cellSize: 0.16666667
+ manualTileSize: 0
+ tileSize: 256
+ accuratePlacement: 0
+ debug:
+ m_Flags: 0
+ m_NavMeshData: {fileID: 0}
+--- !u!1 &492081941
+GameObject:
+ m_ObjectHideFlags: 0
+ m_PrefabParentObject: {fileID: 0}
+ m_PrefabInternal: {fileID: 0}
+ serializedVersion: 5
+ m_Component:
+ - component: {fileID: 492081945}
+ - component: {fileID: 492081944}
+ - component: {fileID: 492081943}
+ - component: {fileID: 492081942}
+ m_Layer: 0
+ m_Name: Main Camera
+ m_TagString: MainCamera
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!81 &492081942
+AudioListener:
+ m_ObjectHideFlags: 0
+ m_PrefabParentObject: {fileID: 0}
+ m_PrefabInternal: {fileID: 0}
+ m_GameObject: {fileID: 492081941}
+ m_Enabled: 1
+--- !u!124 &492081943
+Behaviour:
+ m_ObjectHideFlags: 0
+ m_PrefabParentObject: {fileID: 0}
+ m_PrefabInternal: {fileID: 0}
+ m_GameObject: {fileID: 492081941}
+ m_Enabled: 1
+--- !u!20 &492081944
+Camera:
+ m_ObjectHideFlags: 0
+ m_PrefabParentObject: {fileID: 0}
+ m_PrefabInternal: {fileID: 0}
+ m_GameObject: {fileID: 492081941}
+ m_Enabled: 1
+ serializedVersion: 2
+ m_ClearFlags: 1
+ m_BackGroundColor: {r: 0.19215687, g: 0.3019608, b: 0.4745098, a: 0}
+ m_NormalizedViewPortRect:
+ serializedVersion: 2
+ x: 0
+ y: 0
+ width: 1
+ height: 1
+ near clip plane: 0.3
+ far clip plane: 1000
+ field of view: 60
+ orthographic: 1
+ orthographic size: 5
+ m_Depth: -1
+ m_CullingMask:
+ serializedVersion: 2
+ m_Bits: 4294967295
+ m_RenderingPath: -1
+ m_TargetTexture: {fileID: 0}
+ m_TargetDisplay: 0
+ m_TargetEye: 3
+ m_HDR: 1
+ m_AllowMSAA: 1
+ m_AllowDynamicResolution: 0
+ m_ForceIntoRT: 0
+ m_OcclusionCulling: 1
+ m_StereoConvergence: 10
+ m_StereoSeparation: 0.022
+--- !u!4 &492081945
+Transform:
+ m_ObjectHideFlags: 0
+ m_PrefabParentObject: {fileID: 0}
+ m_PrefabInternal: {fileID: 0}
+ m_GameObject: {fileID: 492081941}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: -10}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_Children:
+ - {fileID: 904015944}
+ m_Father: {fileID: 0}
+ m_RootOrder: 0
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+--- !u!1 &904015943
+GameObject:
+ m_ObjectHideFlags: 0
+ m_PrefabParentObject: {fileID: 0}
+ m_PrefabInternal: {fileID: 0}
+ serializedVersion: 5
+ m_Component:
+ - component: {fileID: 904015944}
+ - component: {fileID: 904015945}
+ m_Layer: 0
+ m_Name: HelloTFLite
+ m_TagString: Untagged
+ m_Icon: {fileID: 0}
+ m_NavMeshLayer: 0
+ m_StaticEditorFlags: 0
+ m_IsActive: 1
+--- !u!4 &904015944
+Transform:
+ m_ObjectHideFlags: 0
+ m_PrefabParentObject: {fileID: 0}
+ m_PrefabInternal: {fileID: 0}
+ m_GameObject: {fileID: 904015943}
+ m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+ m_LocalPosition: {x: 0, y: 0, z: 0}
+ m_LocalScale: {x: 1, y: 1, z: 1}
+ m_Children: []
+ m_Father: {fileID: 492081945}
+ m_RootOrder: 0
+ m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+--- !u!114 &904015945
+MonoBehaviour:
+ m_ObjectHideFlags: 0
+ m_PrefabParentObject: {fileID: 0}
+ m_PrefabInternal: {fileID: 0}
+ m_GameObject: {fileID: 904015943}
+ m_Enabled: 1
+ m_EditorHideFlags: 0
+ m_Script: {fileID: 11500000, guid: 899510441e0ca4be0879d3055e467878, type: 3}
+ m_Name:
+ m_EditorClassIdentifier:
+ model: {fileID: 4900000, guid: adff4e1dbdba344c199ee4fe7e84457e, type: 3}
+ inputs:
+ - 1
+ - 3
+ - 7
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta
new file mode 100644
index 0000000000..e1e13efb66
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: f8a8c37a396584bb7b21687f33d6d3f8
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes
new file mode 100644
index 0000000000..aef0fe3d82
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes
Binary files differ
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta
new file mode 100644
index 0000000000..ba24871413
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: adff4e1dbdba344c199ee4fe7e84457e
+TextScriptImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta
new file mode 100644
index 0000000000..28fde68b8b
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: f7d1e2dec09b64acdb7b8f5aef9fcb44
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs
new file mode 100644
index 0000000000..abca814499
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using TensorFlowLite;
+using UnityEngine;
+
+/// <summary>
+/// Simple example demonstrating use of the experimental C# bindings for TensorFlowLite.
+/// </summary>
+public class HelloTFLite : MonoBehaviour {
+
+ [Tooltip("Configurable TFLite model.")]
+ public TextAsset model;
+
+ [Tooltip("Configurable TFLite input tensor data.")]
+ public float[] inputs;
+
+ private Interpreter interpreter;
+ private float[] outputs;
+
+ void Start () {
+ interpreter = new Interpreter(model.bytes);
+ Debug.LogFormat("InputCount: {0}, OutputCount: {1}",
+ interpreter.GetInputTensorCount(),
+ interpreter.GetOutputTensorCount());
+ }
+
+ void Update () {
+ if (inputs == null) {
+ return;
+ }
+
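+    // Lazily resize and (re)allocate tensors whenever the configured input
+    // length changes.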
+ if (outputs == null || outputs.Length != inputs.Length) {
+ interpreter.ResizeInputTensor(0, new int[]{inputs.Length});
+ interpreter.AllocateTensors();
+ outputs = new float[inputs.Length];
+ }
+
+ interpreter.SetInputTensorData(0, inputs);
+ interpreter.Invoke();
+ interpreter.GetOutputTensorData(0, outputs);
+
+ Debug.LogFormat("Input: {0}, Output: {1}",
+ ArrayToString(inputs),
+ ArrayToString(outputs));
+ }
+
+ void OnDestroy() {
+ interpreter.Dispose();
+ }
+
+ private static string ArrayToString(float[] values) {
+ return string.Join(",", values.Select(x => x.ToString()).ToArray());
+ }
+}
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta
new file mode 100644
index 0000000000..ba83f45084
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 899510441e0ca4be0879d3055e467878
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta
new file mode 100644
index 0000000000..bf5ce15c6a
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 16dad1655bcdc48f7b325a2a634b9c69
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta
new file mode 100644
index 0000000000..22ed2c466b
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: d70863368f8904d509a9b73d3a555914
+folderAsset: yes
+DefaultImporter:
+ externalObjects: {}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
new file mode 100644
index 0000000000..ab966bae2e
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
@@ -0,0 +1,145 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+using System;
+using System.Runtime.InteropServices;
+
+using TFL_Interpreter = System.IntPtr;
+using TFL_Tensor = System.IntPtr;
+
+namespace TensorFlowLite
+{
+ /// <summary>
+ /// Simple C# bindings for the experimental TensorFlowLite C API.
+ /// </summary>
+ public class Interpreter : IDisposable
+ {
+ private const string TensorFlowLibrary = "tensorflowlite_c";
+
+    private TFL_Interpreter handle;
+
+    // Keeps the model buffer pinned, as the native interpreter may reference
+    // it for its entire lifetime; released in Dispose().
+    private GCHandle modelDataHandle;
+
+    public Interpreter(byte[] modelData) {
+      modelDataHandle = GCHandle.Alloc(modelData, GCHandleType.Pinned);
+      IntPtr modelDataPtr = modelDataHandle.AddrOfPinnedObject();
+      handle = TFL_NewInterpreter(modelDataPtr, modelData.Length);
+      if (handle == IntPtr.Zero) throw new Exception("Failed to create TensorFlowLite Interpreter");
+    }
+
+    ~Interpreter() {
+      Dispose();
+    }
+
+    public void Dispose() {
+      if (handle != IntPtr.Zero) TFL_DeleteInterpreter(handle);
+      handle = IntPtr.Zero;
+      if (modelDataHandle.IsAllocated) modelDataHandle.Free();
+    }
+
+ public void Invoke() {
+ ThrowIfError(TFL_InterpreterInvoke(handle));
+ }
+
+ public int GetInputTensorCount() {
+ return TFL_InterpreterGetInputTensorCount(handle);
+ }
+
+    public void SetInputTensorData(int inputTensorIndex, Array inputTensorData) {
+      GCHandle tensorDataHandle = GCHandle.Alloc(inputTensorData, GCHandleType.Pinned);
+      IntPtr tensorDataPtr = tensorDataHandle.AddrOfPinnedObject();
+      TFL_Tensor tensor = TFL_InterpreterGetInputTensor(handle, inputTensorIndex);
+      int status = TFL_TensorCopyFromBuffer(
+          tensor, tensorDataPtr, Buffer.ByteLength(inputTensorData));
+      // The copy is synchronous, so the pin can be released before checking
+      // the result.
+      tensorDataHandle.Free();
+      ThrowIfError(status);
+    }
+
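+    /// <summary>
+    /// Resizes the input tensor at inputTensorIndex to inputTensorShape;
+    /// call AllocateTensors() afterwards to re-allocate tensor buffers.
+    /// </summary>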
+ public void ResizeInputTensor(int inputTensorIndex, int[] inputTensorShape) {
+ ThrowIfError(TFL_InterpreterResizeInputTensor(
+ handle, inputTensorIndex, inputTensorShape, inputTensorShape.Length));
+ }
+
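+    /// <summary>
+    /// Allocates buffers for all tensors; call before the first Invoke() and
+    /// after any ResizeInputTensor().
+    /// </summary>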
+ public void AllocateTensors() {
+ ThrowIfError(TFL_InterpreterAllocateTensors(handle));
+ }
+
+ public int GetOutputTensorCount() {
+ return TFL_InterpreterGetOutputTensorCount(handle);
+ }
+
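+    /// <summary>
+    /// Copies the output tensor at outputTensorIndex into outputTensorData.
+    /// The buffer's byte length must match the tensor's allocated size.
+    /// </summary>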
+    public void GetOutputTensorData(int outputTensorIndex, Array outputTensorData) {
+      GCHandle tensorDataHandle = GCHandle.Alloc(outputTensorData, GCHandleType.Pinned);
+      try {
+        IntPtr tensorDataPtr = tensorDataHandle.AddrOfPinnedObject();
+        TFL_Tensor tensor = TFL_InterpreterGetOutputTensor(handle, outputTensorIndex);
+        ThrowIfError(TFL_TensorCopyToBuffer(
+            tensor, tensorDataPtr, Buffer.ByteLength(outputTensorData)));
+      } finally {
+        // Unpin the buffer; the native copy has completed by this point.
+        tensorDataHandle.Free();
+      }
+    }
+
+ private static void ThrowIfError(int resultCode) {
+ if (resultCode != 0) throw new Exception("TensorFlowLite operation failed.");
+ }
+
+ #region Externs
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe TFL_Interpreter TFL_NewInterpreter(
+ IntPtr model_data,
+ int model_size);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe void TFL_DeleteInterpreter(TFL_Interpreter interpreter);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe int TFL_InterpreterGetInputTensorCount(
+ TFL_Interpreter interpreter);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe TFL_Tensor TFL_InterpreterGetInputTensor(
+ TFL_Interpreter interpreter,
+ int input_index);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe int TFL_InterpreterResizeInputTensor(
+ TFL_Interpreter interpreter,
+ int input_index,
+ int[] input_dims,
+ int input_dims_size);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe int TFL_InterpreterAllocateTensors(
+ TFL_Interpreter interpreter);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe int TFL_InterpreterInvoke(TFL_Interpreter interpreter);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe int TFL_InterpreterGetOutputTensorCount(
+ TFL_Interpreter interpreter);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe TFL_Tensor TFL_InterpreterGetOutputTensor(
+ TFL_Interpreter interpreter,
+ int output_index);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe int TFL_TensorCopyFromBuffer(
+ TFL_Tensor tensor,
+ IntPtr input_data,
+ int input_data_size);
+
+ [DllImport (TensorFlowLibrary)]
+ private static extern unsafe int TFL_TensorCopyToBuffer(
+ TFL_Tensor tensor,
+ IntPtr output_data,
+ int output_data_size);
+
+ #endregion
+ }
+}
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta
new file mode 100644
index 0000000000..5ec84ef7f7
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 0bbaf59e6ac914ed1b28174fb9008a09
+MonoImporter:
+ externalObjects: {}
+ serializedVersion: 2
+ defaultReferences: []
+ executionOrder: 0
+ icon: {instanceID: 0}
+ userData:
+ assetBundleName:
+ assetBundleVariant:
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset
new file mode 100644
index 0000000000..da6112576a
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset
@@ -0,0 +1,17 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!11 &1
+AudioManager:
+ m_ObjectHideFlags: 0
+ m_Volume: 1
+ Rolloff Scale: 1
+ Doppler Factor: 1
+ Default Speaker Mode: 2
+ m_SampleRate: 0
+ m_DSPBufferSize: 0
+ m_VirtualVoiceCount: 512
+ m_RealVoiceCount: 32
+ m_SpatializerPlugin:
+ m_AmbisonicDecoderPlugin:
+ m_DisableAudio: 0
+ m_VirtualizeEffects: 1
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset
new file mode 100644
index 0000000000..e7886b266a
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset
@@ -0,0 +1,6 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!236 &1
+ClusterInputManager:
+ m_ObjectHideFlags: 0
+ m_Inputs: []
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset
new file mode 100644
index 0000000000..78992f08c7
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset
@@ -0,0 +1,29 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!55 &1
+PhysicsManager:
+ m_ObjectHideFlags: 0
+ serializedVersion: 7
+ m_Gravity: {x: 0, y: -9.81, z: 0}
+ m_DefaultMaterial: {fileID: 0}
+ m_BounceThreshold: 2
+ m_SleepThreshold: 0.005
+ m_DefaultContactOffset: 0.01
+ m_DefaultSolverIterations: 6
+ m_DefaultSolverVelocityIterations: 1
+ m_QueriesHitBackfaces: 0
+ m_QueriesHitTriggers: 1
+ m_EnableAdaptiveForce: 0
+ m_ClothInterCollisionDistance: 0
+ m_ClothInterCollisionStiffness: 0
+ m_ContactsGeneration: 1
+ m_LayerCollisionMatrix: ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ m_AutoSimulation: 1
+ m_AutoSyncTransforms: 1
+ m_ClothInterCollisionSettingsToggle: 0
+ m_ContactPairsMode: 0
+ m_BroadphaseType: 0
+ m_WorldBounds:
+ m_Center: {x: 0, y: 0, z: 0}
+ m_Extent: {x: 250, y: 250, z: 250}
+ m_WorldSubdivisions: 8
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset
new file mode 100644
index 0000000000..6dc24f7dfd
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset
@@ -0,0 +1,7 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!1045 &1
+EditorBuildSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 2
+ m_Scenes: []
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset
new file mode 100644
index 0000000000..fcd016402f
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset
@@ -0,0 +1,21 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!159 &1
+EditorSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 7
+ m_ExternalVersionControlSupport: Visible Meta Files
+ m_SerializationMode: 2
+ m_LineEndingsForNewScripts: 1
+ m_DefaultBehaviorMode: 1
+ m_SpritePackerMode: 4
+ m_SpritePackerPaddingPower: 1
+ m_EtcTextureCompressorBehavior: 1
+ m_EtcTextureFastCompressor: 1
+ m_EtcTextureNormalCompressor: 2
+ m_EtcTextureBestCompressor: 4
+ m_ProjectGenerationIncludedExtensions: txt;xml;fnt;cd;asmdef;rsp
+ m_ProjectGenerationRootNamespace:
+ m_UserGeneratedProjectSuffix:
+ m_CollabEditorSettings:
+ inProgressEnabled: 1
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset
new file mode 100644
index 0000000000..74d7b532b0
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset
@@ -0,0 +1,61 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!30 &1
+GraphicsSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 12
+ m_Deferred:
+ m_Mode: 1
+ m_Shader: {fileID: 69, guid: 0000000000000000f000000000000000, type: 0}
+ m_DeferredReflections:
+ m_Mode: 1
+ m_Shader: {fileID: 74, guid: 0000000000000000f000000000000000, type: 0}
+ m_ScreenSpaceShadows:
+ m_Mode: 1
+ m_Shader: {fileID: 64, guid: 0000000000000000f000000000000000, type: 0}
+ m_LegacyDeferred:
+ m_Mode: 1
+ m_Shader: {fileID: 63, guid: 0000000000000000f000000000000000, type: 0}
+ m_DepthNormals:
+ m_Mode: 1
+ m_Shader: {fileID: 62, guid: 0000000000000000f000000000000000, type: 0}
+ m_MotionVectors:
+ m_Mode: 1
+ m_Shader: {fileID: 75, guid: 0000000000000000f000000000000000, type: 0}
+ m_LightHalo:
+ m_Mode: 1
+ m_Shader: {fileID: 105, guid: 0000000000000000f000000000000000, type: 0}
+ m_LensFlare:
+ m_Mode: 1
+ m_Shader: {fileID: 102, guid: 0000000000000000f000000000000000, type: 0}
+ m_AlwaysIncludedShaders:
+ - {fileID: 7, guid: 0000000000000000f000000000000000, type: 0}
+ - {fileID: 15104, guid: 0000000000000000f000000000000000, type: 0}
+ - {fileID: 15105, guid: 0000000000000000f000000000000000, type: 0}
+ - {fileID: 15106, guid: 0000000000000000f000000000000000, type: 0}
+ - {fileID: 10753, guid: 0000000000000000f000000000000000, type: 0}
+ - {fileID: 10770, guid: 0000000000000000f000000000000000, type: 0}
+ m_PreloadedShaders: []
+ m_SpritesDefaultMaterial: {fileID: 10754, guid: 0000000000000000f000000000000000,
+ type: 0}
+ m_CustomRenderPipeline: {fileID: 0}
+ m_TransparencySortMode: 0
+ m_TransparencySortAxis: {x: 0, y: 0, z: 1}
+ m_DefaultRenderingPath: 1
+ m_DefaultMobileRenderingPath: 1
+ m_TierSettings: []
+ m_LightmapStripping: 0
+ m_FogStripping: 0
+ m_InstancingStripping: 0
+ m_LightmapKeepPlain: 1
+ m_LightmapKeepDirCombined: 1
+ m_LightmapKeepDynamicPlain: 1
+ m_LightmapKeepDynamicDirCombined: 1
+ m_LightmapKeepShadowMask: 1
+ m_LightmapKeepSubtractive: 1
+ m_FogKeepLinear: 1
+ m_FogKeepExp: 1
+ m_FogKeepExp2: 1
+ m_AlbedoSwatchInfos: []
+ m_LightsUseLinearIntensity: 0
+ m_LightsUseColorTemperature: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset
new file mode 100644
index 0000000000..17c8f538e2
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset
@@ -0,0 +1,295 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!13 &1
+InputManager:
+ m_ObjectHideFlags: 0
+ serializedVersion: 2
+ m_Axes:
+ - serializedVersion: 3
+ m_Name: Horizontal
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton: left
+ positiveButton: right
+ altNegativeButton: a
+ altPositiveButton: d
+ gravity: 3
+ dead: 0.001
+ sensitivity: 3
+ snap: 1
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Vertical
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton: down
+ positiveButton: up
+ altNegativeButton: s
+ altPositiveButton: w
+ gravity: 3
+ dead: 0.001
+ sensitivity: 3
+ snap: 1
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Fire1
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: left ctrl
+ altNegativeButton:
+ altPositiveButton: mouse 0
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Fire2
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: left alt
+ altNegativeButton:
+ altPositiveButton: mouse 1
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Fire3
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: left shift
+ altNegativeButton:
+ altPositiveButton: mouse 2
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Jump
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: space
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Mouse X
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton:
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 0
+ dead: 0
+ sensitivity: 0.1
+ snap: 0
+ invert: 0
+ type: 1
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Mouse Y
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton:
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 0
+ dead: 0
+ sensitivity: 0.1
+ snap: 0
+ invert: 0
+ type: 1
+ axis: 1
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Mouse ScrollWheel
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton:
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 0
+ dead: 0
+ sensitivity: 0.1
+ snap: 0
+ invert: 0
+ type: 1
+ axis: 2
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Horizontal
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton:
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 0
+ dead: 0.19
+ sensitivity: 1
+ snap: 0
+ invert: 0
+ type: 2
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Vertical
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton:
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 0
+ dead: 0.19
+ sensitivity: 1
+ snap: 0
+ invert: 1
+ type: 2
+ axis: 1
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Fire1
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: joystick button 0
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Fire2
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: joystick button 1
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Fire3
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: joystick button 2
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Jump
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: joystick button 3
+ altNegativeButton:
+ altPositiveButton:
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Submit
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: return
+ altNegativeButton:
+ altPositiveButton: joystick button 0
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Submit
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: enter
+ altNegativeButton:
+ altPositiveButton: space
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
+ - serializedVersion: 3
+ m_Name: Cancel
+ descriptiveName:
+ descriptiveNegativeName:
+ negativeButton:
+ positiveButton: escape
+ altNegativeButton:
+ altPositiveButton: joystick button 1
+ gravity: 1000
+ dead: 0.001
+ sensitivity: 1000
+ snap: 0
+ invert: 0
+ type: 0
+ axis: 0
+ joyNum: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset
new file mode 100644
index 0000000000..3b0b7c3d18
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset
@@ -0,0 +1,91 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!126 &1
+NavMeshProjectSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 2
+ areas:
+ - name: Walkable
+ cost: 1
+ - name: Not Walkable
+ cost: 1
+ - name: Jump
+ cost: 2
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ - name:
+ cost: 1
+ m_LastAgentTypeID: -887442657
+ m_Settings:
+ - serializedVersion: 2
+ agentTypeID: 0
+ agentRadius: 0.5
+ agentHeight: 2
+ agentSlope: 45
+ agentClimb: 0.75
+ ledgeDropHeight: 0
+ maxJumpAcrossDistance: 0
+ minRegionArea: 2
+ manualCellSize: 0
+ cellSize: 0.16666667
+ manualTileSize: 0
+ tileSize: 256
+ accuratePlacement: 0
+ debug:
+ m_Flags: 0
+ m_SettingNames:
+ - Humanoid
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset
new file mode 100644
index 0000000000..5dc6a831d9
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset
@@ -0,0 +1,8 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!149 &1
+NetworkManager:
+ m_ObjectHideFlags: 0
+ m_DebugLevel: 0
+ m_Sendrate: 15
+ m_AssetToPrefab: {}
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset
new file mode 100644
index 0000000000..132ee6bc86
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset
@@ -0,0 +1,37 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!19 &1
+Physics2DSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 3
+ m_Gravity: {x: 0, y: -9.81}
+ m_DefaultMaterial: {fileID: 0}
+ m_VelocityIterations: 8
+ m_PositionIterations: 3
+ m_VelocityThreshold: 1
+ m_MaxLinearCorrection: 0.2
+ m_MaxAngularCorrection: 8
+ m_MaxTranslationSpeed: 100
+ m_MaxRotationSpeed: 360
+ m_BaumgarteScale: 0.2
+ m_BaumgarteTimeOfImpactScale: 0.75
+ m_TimeToSleep: 0.5
+ m_LinearSleepTolerance: 0.01
+ m_AngularSleepTolerance: 2
+ m_DefaultContactOffset: 0.01
+ m_AutoSimulation: 1
+ m_QueriesHitTriggers: 1
+ m_QueriesStartInColliders: 1
+ m_ChangeStopsCallbacks: 0
+ m_CallbacksOnDisable: 1
+ m_AutoSyncTransforms: 1
+ m_AlwaysShowColliders: 0
+ m_ShowColliderSleep: 1
+ m_ShowColliderContacts: 0
+ m_ShowColliderAABB: 0
+ m_ContactArrowScale: 0.2
+ m_ColliderAwakeColor: {r: 0.5686275, g: 0.95686275, b: 0.54509807, a: 0.7529412}
+ m_ColliderAsleepColor: {r: 0.5686275, g: 0.95686275, b: 0.54509807, a: 0.36078432}
+ m_ColliderContactColor: {r: 1, g: 0, b: 1, a: 0.6862745}
+ m_ColliderAABBColor: {r: 1, g: 1, b: 0, a: 0.2509804}
+ m_LayerCollisionMatrix: ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset
new file mode 100644
index 0000000000..3fbfab76c1
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset
@@ -0,0 +1,641 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!129 &1
+PlayerSettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 14
+ productGUID: a084943b991dd4597b140f4ce2b41c65
+ AndroidProfiler: 0
+ AndroidFilterTouchesWhenObscured: 0
+ defaultScreenOrientation: 4
+ targetDevice: 2
+ useOnDemandResources: 0
+ accelerometerFrequency: 60
+ companyName: DefaultCompany
+ productName: TensorFlowLitePlugin
+ defaultCursor: {fileID: 0}
+ cursorHotspot: {x: 0, y: 0}
+ m_SplashScreenBackgroundColor: {r: 0.13725491, g: 0.12156863, b: 0.1254902, a: 1}
+ m_ShowUnitySplashScreen: 1
+ m_ShowUnitySplashLogo: 1
+ m_SplashScreenOverlayOpacity: 1
+ m_SplashScreenAnimation: 1
+ m_SplashScreenLogoStyle: 1
+ m_SplashScreenDrawMode: 0
+ m_SplashScreenBackgroundAnimationZoom: 1
+ m_SplashScreenLogoAnimationZoom: 1
+ m_SplashScreenBackgroundLandscapeAspect: 1
+ m_SplashScreenBackgroundPortraitAspect: 1
+ m_SplashScreenBackgroundLandscapeUvs:
+ serializedVersion: 2
+ x: 0
+ y: 0
+ width: 1
+ height: 1
+ m_SplashScreenBackgroundPortraitUvs:
+ serializedVersion: 2
+ x: 0
+ y: 0
+ width: 1
+ height: 1
+ m_SplashScreenLogos: []
+ m_VirtualRealitySplashScreen: {fileID: 0}
+ m_HolographicTrackingLossScreen: {fileID: 0}
+ defaultScreenWidth: 1024
+ defaultScreenHeight: 768
+ defaultScreenWidthWeb: 960
+ defaultScreenHeightWeb: 600
+ m_StereoRenderingPath: 0
+ m_ActiveColorSpace: 0
+ m_MTRendering: 1
+ m_StackTraceTypes: 010000000100000001000000010000000100000001000000
+ iosShowActivityIndicatorOnLoading: -1
+ androidShowActivityIndicatorOnLoading: -1
+ tizenShowActivityIndicatorOnLoading: -1
+ iosAppInBackgroundBehavior: 0
+ displayResolutionDialog: 1
+ iosAllowHTTPDownload: 1
+ allowedAutorotateToPortrait: 1
+ allowedAutorotateToPortraitUpsideDown: 1
+ allowedAutorotateToLandscapeRight: 1
+ allowedAutorotateToLandscapeLeft: 1
+ useOSAutorotation: 1
+ use32BitDisplayBuffer: 1
+ preserveFramebufferAlpha: 0
+ disableDepthAndStencilBuffers: 0
+ androidBlitType: 0
+ defaultIsFullScreen: 1
+ defaultIsNativeResolution: 1
+ macRetinaSupport: 1
+ runInBackground: 0
+ captureSingleScreen: 0
+ muteOtherAudioSources: 0
+ Prepare IOS For Recording: 0
+ Force IOS Speakers When Recording: 0
+ deferSystemGesturesMode: 0
+ hideHomeButton: 0
+ submitAnalytics: 1
+ usePlayerLog: 1
+ bakeCollisionMeshes: 0
+ forceSingleInstance: 0
+ resizableWindow: 0
+ useMacAppStoreValidation: 0
+ macAppStoreCategory: public.app-category.games
+ gpuSkinning: 0
+ graphicsJobs: 0
+ xboxPIXTextureCapture: 0
+ xboxEnableAvatar: 0
+ xboxEnableKinect: 0
+ xboxEnableKinectAutoTracking: 0
+ xboxEnableFitness: 0
+ visibleInBackground: 1
+ allowFullscreenSwitch: 1
+ graphicsJobMode: 0
+ macFullscreenMode: 2
+ d3d11FullscreenMode: 1
+ xboxSpeechDB: 0
+ xboxEnableHeadOrientation: 0
+ xboxEnableGuest: 0
+ xboxEnablePIXSampling: 0
+ metalFramebufferOnly: 0
+ n3dsDisableStereoscopicView: 0
+ n3dsEnableSharedListOpt: 1
+ n3dsEnableVSync: 0
+ xboxOneResolution: 0
+ xboxOneSResolution: 0
+ xboxOneXResolution: 3
+ xboxOneMonoLoggingLevel: 0
+ xboxOneLoggingLevel: 1
+ xboxOneDisableEsram: 0
+ xboxOnePresentImmediateThreshold: 0
+ videoMemoryForVertexBuffers: 0
+ psp2PowerMode: 0
+ psp2AcquireBGM: 1
+ wiiUTVResolution: 0
+ wiiUGamePadMSAA: 1
+ wiiUSupportsNunchuk: 0
+ wiiUSupportsClassicController: 0
+ wiiUSupportsBalanceBoard: 0
+ wiiUSupportsMotionPlus: 0
+ wiiUSupportsProController: 0
+ wiiUAllowScreenCapture: 1
+ wiiUControllerCount: 0
+ m_SupportedAspectRatios:
+ 4:3: 1
+ 5:4: 1
+ 16:10: 1
+ 16:9: 1
+ Others: 1
+ bundleVersion: 1.0
+ preloadedAssets: []
+ metroInputSource: 0
+ wsaTransparentSwapchain: 0
+ m_HolographicPauseOnTrackingLoss: 1
+ xboxOneDisableKinectGpuReservation: 0
+ xboxOneEnable7thCore: 0
+ vrSettings:
+ cardboard:
+ depthFormat: 0
+ enableTransitionView: 0
+ daydream:
+ depthFormat: 0
+ useSustainedPerformanceMode: 0
+ enableVideoLayer: 0
+ useProtectedVideoMemory: 0
+ minimumSupportedHeadTracking: 0
+ maximumSupportedHeadTracking: 1
+ hololens:
+ depthFormat: 1
+ depthBufferSharingEnabled: 0
+ oculus:
+ sharedDepthBuffer: 0
+ dashSupport: 0
+ protectGraphicsMemory: 0
+ useHDRDisplay: 0
+ m_ColorGamuts: 00000000
+ targetPixelDensity: 30
+ resolutionScalingMode: 0
+ androidSupportedAspectRatio: 1
+ androidMaxAspectRatio: 2.1
+ applicationIdentifier: {}
+ buildNumber: {}
+ AndroidBundleVersionCode: 1
+ AndroidMinSdkVersion: 16
+ AndroidTargetSdkVersion: 0
+ AndroidPreferredInstallLocation: 1
+ aotOptions:
+ stripEngineCode: 1
+ iPhoneStrippingLevel: 0
+ iPhoneScriptCallOptimization: 0
+ ForceInternetPermission: 0
+ ForceSDCardPermission: 0
+ CreateWallpaper: 0
+ APKExpansionFiles: 0
+ keepLoadedShadersAlive: 0
+ StripUnusedMeshComponents: 0
+ VertexChannelCompressionMask:
+ serializedVersion: 2
+ m_Bits: 238
+ iPhoneSdkVersion: 988
+ iOSTargetOSVersionString: 7.0
+ tvOSSdkVersion: 0
+ tvOSRequireExtendedGameController: 0
+ tvOSTargetOSVersionString: 9.0
+ uIPrerenderedIcon: 0
+ uIRequiresPersistentWiFi: 0
+ uIRequiresFullScreen: 1
+ uIStatusBarHidden: 1
+ uIExitOnSuspend: 0
+ uIStatusBarStyle: 0
+ iPhoneSplashScreen: {fileID: 0}
+ iPhoneHighResSplashScreen: {fileID: 0}
+ iPhoneTallHighResSplashScreen: {fileID: 0}
+ iPhone47inSplashScreen: {fileID: 0}
+ iPhone55inPortraitSplashScreen: {fileID: 0}
+ iPhone55inLandscapeSplashScreen: {fileID: 0}
+ iPhone58inPortraitSplashScreen: {fileID: 0}
+ iPhone58inLandscapeSplashScreen: {fileID: 0}
+ iPadPortraitSplashScreen: {fileID: 0}
+ iPadHighResPortraitSplashScreen: {fileID: 0}
+ iPadLandscapeSplashScreen: {fileID: 0}
+ iPadHighResLandscapeSplashScreen: {fileID: 0}
+ appleTVSplashScreen: {fileID: 0}
+ appleTVSplashScreen2x: {fileID: 0}
+ tvOSSmallIconLayers: []
+ tvOSSmallIconLayers2x: []
+ tvOSLargeIconLayers: []
+ tvOSTopShelfImageLayers: []
+ tvOSTopShelfImageLayers2x: []
+ tvOSTopShelfImageWideLayers: []
+ tvOSTopShelfImageWideLayers2x: []
+ iOSLaunchScreenType: 0
+ iOSLaunchScreenPortrait: {fileID: 0}
+ iOSLaunchScreenLandscape: {fileID: 0}
+ iOSLaunchScreenBackgroundColor:
+ serializedVersion: 2
+ rgba: 0
+ iOSLaunchScreenFillPct: 100
+ iOSLaunchScreenSize: 100
+ iOSLaunchScreenCustomXibPath:
+ iOSLaunchScreeniPadType: 0
+ iOSLaunchScreeniPadImage: {fileID: 0}
+ iOSLaunchScreeniPadBackgroundColor:
+ serializedVersion: 2
+ rgba: 0
+ iOSLaunchScreeniPadFillPct: 100
+ iOSLaunchScreeniPadSize: 100
+ iOSLaunchScreeniPadCustomXibPath:
+ iOSUseLaunchScreenStoryboard: 0
+ iOSLaunchScreenCustomStoryboardPath:
+ iOSDeviceRequirements: []
+ iOSURLSchemes: []
+ iOSBackgroundModes: 0
+ iOSMetalForceHardShadows: 0
+ metalEditorSupport: 1
+ metalAPIValidation: 1
+ iOSRenderExtraFrameOnPause: 0
+ appleDeveloperTeamID:
+ iOSManualSigningProvisioningProfileID:
+ tvOSManualSigningProvisioningProfileID:
+ appleEnableAutomaticSigning: 0
+ clonedFromGUID: 00000000000000000000000000000000
+ AndroidTargetDevice: 0
+ AndroidSplashScreenScale: 0
+ androidSplashScreen: {fileID: 0}
+ AndroidKeystoreName:
+ AndroidKeyaliasName:
+ AndroidTVCompatibility: 1
+ AndroidIsGame: 1
+ AndroidEnableTango: 0
+ androidEnableBanner: 1
+ androidUseLowAccuracyLocation: 0
+ m_AndroidBanners:
+ - width: 320
+ height: 180
+ banner: {fileID: 0}
+ androidGamepadSupportLevel: 0
+ resolutionDialogBanner: {fileID: 0}
+ m_BuildTargetIcons: []
+ m_BuildTargetBatching: []
+ m_BuildTargetGraphicsAPIs: []
+ m_BuildTargetVRSettings: []
+ m_BuildTargetEnableVuforiaSettings: []
+ openGLRequireES31: 0
+ openGLRequireES31AEP: 0
+ m_TemplateCustomTags: {}
+ mobileMTRendering:
+ Android: 1
+ iPhone: 1
+ tvOS: 1
+ m_BuildTargetGroupLightmapEncodingQuality: []
+ wiiUTitleID: 0005000011000000
+ wiiUGroupID: 00010000
+ wiiUCommonSaveSize: 4096
+ wiiUAccountSaveSize: 2048
+ wiiUOlvAccessKey: 0
+ wiiUTinCode: 0
+ wiiUJoinGameId: 0
+ wiiUJoinGameModeMask: 0000000000000000
+ wiiUCommonBossSize: 0
+ wiiUAccountBossSize: 0
+ wiiUAddOnUniqueIDs: []
+ wiiUMainThreadStackSize: 3072
+ wiiULoaderThreadStackSize: 1024
+ wiiUSystemHeapSize: 128
+ wiiUTVStartupScreen: {fileID: 0}
+ wiiUGamePadStartupScreen: {fileID: 0}
+ wiiUDrcBufferDisabled: 0
+ wiiUProfilerLibPath:
+ playModeTestRunnerEnabled: 0
+ actionOnDotNetUnhandledException: 1
+ enableInternalProfiler: 0
+ logObjCUncaughtExceptions: 1
+ enableCrashReportAPI: 0
+ cameraUsageDescription:
+ locationUsageDescription:
+ microphoneUsageDescription:
+ switchNetLibKey:
+ switchSocketMemoryPoolSize: 6144
+ switchSocketAllocatorPoolSize: 128
+ switchSocketConcurrencyLimit: 14
+ switchScreenResolutionBehavior: 2
+ switchUseCPUProfiler: 0
+ switchApplicationID: 0x01004b9000490000
+ switchNSODependencies:
+ switchTitleNames_0:
+ switchTitleNames_1:
+ switchTitleNames_2:
+ switchTitleNames_3:
+ switchTitleNames_4:
+ switchTitleNames_5:
+ switchTitleNames_6:
+ switchTitleNames_7:
+ switchTitleNames_8:
+ switchTitleNames_9:
+ switchTitleNames_10:
+ switchTitleNames_11:
+ switchTitleNames_12:
+ switchTitleNames_13:
+ switchTitleNames_14:
+ switchPublisherNames_0:
+ switchPublisherNames_1:
+ switchPublisherNames_2:
+ switchPublisherNames_3:
+ switchPublisherNames_4:
+ switchPublisherNames_5:
+ switchPublisherNames_6:
+ switchPublisherNames_7:
+ switchPublisherNames_8:
+ switchPublisherNames_9:
+ switchPublisherNames_10:
+ switchPublisherNames_11:
+ switchPublisherNames_12:
+ switchPublisherNames_13:
+ switchPublisherNames_14:
+ switchIcons_0: {fileID: 0}
+ switchIcons_1: {fileID: 0}
+ switchIcons_2: {fileID: 0}
+ switchIcons_3: {fileID: 0}
+ switchIcons_4: {fileID: 0}
+ switchIcons_5: {fileID: 0}
+ switchIcons_6: {fileID: 0}
+ switchIcons_7: {fileID: 0}
+ switchIcons_8: {fileID: 0}
+ switchIcons_9: {fileID: 0}
+ switchIcons_10: {fileID: 0}
+ switchIcons_11: {fileID: 0}
+ switchIcons_12: {fileID: 0}
+ switchIcons_13: {fileID: 0}
+ switchIcons_14: {fileID: 0}
+ switchSmallIcons_0: {fileID: 0}
+ switchSmallIcons_1: {fileID: 0}
+ switchSmallIcons_2: {fileID: 0}
+ switchSmallIcons_3: {fileID: 0}
+ switchSmallIcons_4: {fileID: 0}
+ switchSmallIcons_5: {fileID: 0}
+ switchSmallIcons_6: {fileID: 0}
+ switchSmallIcons_7: {fileID: 0}
+ switchSmallIcons_8: {fileID: 0}
+ switchSmallIcons_9: {fileID: 0}
+ switchSmallIcons_10: {fileID: 0}
+ switchSmallIcons_11: {fileID: 0}
+ switchSmallIcons_12: {fileID: 0}
+ switchSmallIcons_13: {fileID: 0}
+ switchSmallIcons_14: {fileID: 0}
+ switchManualHTML:
+ switchAccessibleURLs:
+ switchLegalInformation:
+ switchMainThreadStackSize: 1048576
+ switchPresenceGroupId:
+ switchLogoHandling: 0
+ switchReleaseVersion: 0
+ switchDisplayVersion: 1.0.0
+ switchStartupUserAccount: 0
+ switchTouchScreenUsage: 0
+ switchSupportedLanguagesMask: 0
+ switchLogoType: 0
+ switchApplicationErrorCodeCategory:
+ switchUserAccountSaveDataSize: 0
+ switchUserAccountSaveDataJournalSize: 0
+ switchApplicationAttribute: 0
+ switchCardSpecSize: -1
+ switchCardSpecClock: -1
+ switchRatingsMask: 0
+ switchRatingsInt_0: 0
+ switchRatingsInt_1: 0
+ switchRatingsInt_2: 0
+ switchRatingsInt_3: 0
+ switchRatingsInt_4: 0
+ switchRatingsInt_5: 0
+ switchRatingsInt_6: 0
+ switchRatingsInt_7: 0
+ switchRatingsInt_8: 0
+ switchRatingsInt_9: 0
+ switchRatingsInt_10: 0
+ switchRatingsInt_11: 0
+ switchLocalCommunicationIds_0:
+ switchLocalCommunicationIds_1:
+ switchLocalCommunicationIds_2:
+ switchLocalCommunicationIds_3:
+ switchLocalCommunicationIds_4:
+ switchLocalCommunicationIds_5:
+ switchLocalCommunicationIds_6:
+ switchLocalCommunicationIds_7:
+ switchParentalControl: 0
+ switchAllowsScreenshot: 1
+ switchAllowsVideoCapturing: 1
+ switchAllowsRuntimeAddOnContentInstall: 0
+ switchDataLossConfirmation: 0
+ switchSupportedNpadStyles: 3
+ switchSocketConfigEnabled: 0
+ switchTcpInitialSendBufferSize: 32
+ switchTcpInitialReceiveBufferSize: 64
+ switchTcpAutoSendBufferSizeMax: 256
+ switchTcpAutoReceiveBufferSizeMax: 256
+ switchUdpSendBufferSize: 9
+ switchUdpReceiveBufferSize: 42
+ switchSocketBufferEfficiency: 4
+ switchSocketInitializeEnabled: 1
+ switchNetworkInterfaceManagerInitializeEnabled: 1
+ switchPlayerConnectionEnabled: 1
+ ps4NPAgeRating: 12
+ ps4NPTitleSecret:
+ ps4NPTrophyPackPath:
+ ps4ParentalLevel: 11
+ ps4ContentID: ED1633-NPXX51362_00-0000000000000000
+ ps4Category: 0
+ ps4MasterVersion: 01.00
+ ps4AppVersion: 01.00
+ ps4AppType: 0
+ ps4ParamSfxPath:
+ ps4VideoOutPixelFormat: 0
+ ps4VideoOutInitialWidth: 1920
+ ps4VideoOutBaseModeInitialWidth: 1920
+ ps4VideoOutReprojectionRate: 60
+ ps4PronunciationXMLPath:
+ ps4PronunciationSIGPath:
+ ps4BackgroundImagePath:
+ ps4StartupImagePath:
+ ps4StartupImagesFolder:
+ ps4IconImagesFolder:
+ ps4SaveDataImagePath:
+ ps4SdkOverride:
+ ps4BGMPath:
+ ps4ShareFilePath:
+ ps4ShareOverlayImagePath:
+ ps4PrivacyGuardImagePath:
+ ps4NPtitleDatPath:
+ ps4RemotePlayKeyAssignment: -1
+ ps4RemotePlayKeyMappingDir:
+ ps4PlayTogetherPlayerCount: 0
+ ps4EnterButtonAssignment: 1
+ ps4ApplicationParam1: 0
+ ps4ApplicationParam2: 0
+ ps4ApplicationParam3: 0
+ ps4ApplicationParam4: 0
+ ps4DownloadDataSize: 0
+ ps4GarlicHeapSize: 2048
+ ps4ProGarlicHeapSize: 2560
+ ps4Passcode: d3hjjul8UhK6ZnQCEBYYQPozR9sQV066
+ ps4pnSessions: 1
+ ps4pnPresence: 1
+ ps4pnFriends: 1
+ ps4pnGameCustomData: 1
+ playerPrefsSupport: 0
+ restrictedAudioUsageRights: 0
+ ps4UseResolutionFallback: 0
+ ps4ReprojectionSupport: 0
+ ps4UseAudio3dBackend: 0
+ ps4SocialScreenEnabled: 0
+ ps4ScriptOptimizationLevel: 0
+ ps4Audio3dVirtualSpeakerCount: 14
+ ps4attribCpuUsage: 0
+ ps4PatchPkgPath:
+ ps4PatchLatestPkgPath:
+ ps4PatchChangeinfoPath:
+ ps4PatchDayOne: 0
+ ps4attribUserManagement: 0
+ ps4attribMoveSupport: 0
+ ps4attrib3DSupport: 0
+ ps4attribShareSupport: 0
+ ps4attribExclusiveVR: 0
+ ps4disableAutoHideSplash: 0
+ ps4videoRecordingFeaturesUsed: 0
+ ps4contentSearchFeaturesUsed: 0
+ ps4attribEyeToEyeDistanceSettingVR: 0
+ ps4IncludedModules: []
+ monoEnv:
+ psp2Splashimage: {fileID: 0}
+ psp2NPTrophyPackPath:
+ psp2NPSupportGBMorGJP: 0
+ psp2NPAgeRating: 12
+ psp2NPTitleDatPath:
+ psp2NPCommsID:
+ psp2NPCommunicationsID:
+ psp2NPCommsPassphrase:
+ psp2NPCommsSig:
+ psp2ParamSfxPath:
+ psp2ManualPath:
+ psp2LiveAreaGatePath:
+ psp2LiveAreaBackroundPath:
+ psp2LiveAreaPath:
+ psp2LiveAreaTrialPath:
+ psp2PatchChangeInfoPath:
+ psp2PatchOriginalPackage:
+ psp2PackagePassword: 3onkgZsAECEn0fzCoWiCtWCKe4l74pE5
+ psp2KeystoneFile:
+ psp2MemoryExpansionMode: 0
+ psp2DRMType: 0
+ psp2StorageType: 0
+ psp2MediaCapacity: 0
+ psp2DLCConfigPath:
+ psp2ThumbnailPath:
+ psp2BackgroundPath:
+ psp2SoundPath:
+ psp2TrophyCommId:
+ psp2TrophyPackagePath:
+ psp2PackagedResourcesPath:
+ psp2SaveDataQuota: 10240
+ psp2ParentalLevel: 1
+ psp2ShortTitle: Not Set
+ psp2ContentID: IV0000-ABCD12345_00-0123456789ABCDEF
+ psp2Category: 0
+ psp2MasterVersion: 01.00
+ psp2AppVersion: 01.00
+ psp2TVBootMode: 0
+ psp2EnterButtonAssignment: 2
+ psp2TVDisableEmu: 0
+ psp2AllowTwitterDialog: 1
+ psp2Upgradable: 0
+ psp2HealthWarning: 0
+ psp2UseLibLocation: 0
+ psp2InfoBarOnStartup: 0
+ psp2InfoBarColor: 0
+ psp2ScriptOptimizationLevel: 0
+ psmSplashimage: {fileID: 0}
+ splashScreenBackgroundSourceLandscape: {fileID: 0}
+ splashScreenBackgroundSourcePortrait: {fileID: 0}
+ spritePackerPolicy:
+ webGLMemorySize: 256
+ webGLExceptionSupport: 1
+ webGLNameFilesAsHashes: 0
+ webGLDataCaching: 0
+ webGLDebugSymbols: 0
+ webGLEmscriptenArgs:
+ webGLModulesDirectory:
+ webGLTemplate: APPLICATION:Default
+ webGLAnalyzeBuildSize: 0
+ webGLUseEmbeddedResources: 0
+ webGLUseWasm: 0
+ webGLCompressionFormat: 1
+ scriptingDefineSymbols: {}
+ platformArchitecture: {}
+ scriptingBackend: {}
+ incrementalIl2cppBuild: {}
+ additionalIl2CppArgs:
+ scriptingRuntimeVersion: 0
+ apiCompatibilityLevelPerPlatform: {}
+ m_RenderingPath: 1
+ m_MobileRenderingPath: 1
+ metroPackageName: TensorFlowLitePlugin
+ metroPackageVersion:
+ metroCertificatePath:
+ metroCertificatePassword:
+ metroCertificateSubject:
+ metroCertificateIssuer:
+ metroCertificateNotAfter: 0000000000000000
+ metroApplicationDescription: TensorFlowLitePlugin
+ wsaImages: {}
+ metroTileShortName:
+ metroCommandLineArgsFile:
+ metroTileShowName: 0
+ metroMediumTileShowName: 0
+ metroLargeTileShowName: 0
+ metroWideTileShowName: 0
+ metroDefaultTileSize: 1
+ metroTileForegroundText: 2
+ metroTileBackgroundColor: {r: 0.13333334, g: 0.17254902, b: 0.21568628, a: 0}
+ metroSplashScreenBackgroundColor: {r: 0.12941177, g: 0.17254902, b: 0.21568628,
+ a: 1}
+ metroSplashScreenUseBackgroundColor: 0
+ platformCapabilities: {}
+ metroFTAName:
+ metroFTAFileTypes: []
+ metroProtocolName:
+ metroCompilationOverrides: 1
+ tizenProductDescription:
+ tizenProductURL:
+ tizenSigningProfileName:
+ tizenGPSPermissions: 0
+ tizenMicrophonePermissions: 0
+ tizenDeploymentTarget:
+ tizenDeploymentTargetType: -1
+ tizenMinOSVersion: 1
+ n3dsUseExtSaveData: 0
+ n3dsCompressStaticMem: 1
+ n3dsExtSaveDataNumber: 0x12345
+ n3dsStackSize: 131072
+ n3dsTargetPlatform: 2
+ n3dsRegion: 7
+ n3dsMediaSize: 0
+ n3dsLogoStyle: 3
+ n3dsTitle: GameName
+ n3dsProductCode:
+ n3dsApplicationId: 0xFF3FF
+ XboxOneProductId:
+ XboxOneUpdateKey:
+ XboxOneSandboxId:
+ XboxOneContentId:
+ XboxOneTitleId:
+ XboxOneSCId:
+ XboxOneGameOsOverridePath:
+ XboxOnePackagingOverridePath:
+ XboxOneAppManifestOverridePath:
+ XboxOnePackageEncryption: 0
+ XboxOnePackageUpdateGranularity: 2
+ XboxOneDescription:
+ XboxOneLanguage:
+ - enus
+ XboxOneCapability: []
+ XboxOneGameRating: {}
+ XboxOneIsContentPackage: 0
+ XboxOneEnableGPUVariability: 0
+ XboxOneSockets: {}
+ XboxOneSplashScreen: {fileID: 0}
+ XboxOneAllowedProductIds: []
+ XboxOnePersistentLocalStorageSize: 0
+ XboxOneXTitleMemory: 8
+ xboxOneScriptCompiler: 0
+ vrEditorSettings:
+ daydream:
+ daydreamIconForeground: {fileID: 0}
+ daydreamIconBackground: {fileID: 0}
+ cloudServicesEnabled: {}
+ facebookSdkVersion: 7.9.4
+ apiCompatibilityLevel: 2
+ cloudProjectId:
+ projectName:
+ organizationId:
+ cloudEnabled: 0
+ enableNativePlatformBackendsForNewInputSystem: 0
+ disableOldInputManagerSupport: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt
new file mode 100644
index 0000000000..4a9cfb61ab
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt
@@ -0,0 +1 @@
+m_EditorVersion: 2017.4.6f1
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset
new file mode 100644
index 0000000000..05daac3c49
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset
@@ -0,0 +1,191 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!47 &1
+QualitySettings:
+ m_ObjectHideFlags: 0
+ serializedVersion: 5
+ m_CurrentQuality: 5
+ m_QualitySettings:
+ - serializedVersion: 2
+ name: Very Low
+ pixelLightCount: 0
+ shadows: 0
+ shadowResolution: 0
+ shadowProjection: 1
+ shadowCascades: 1
+ shadowDistance: 15
+ shadowNearPlaneOffset: 3
+ shadowCascade2Split: 0.33333334
+ shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+ shadowmaskMode: 0
+ blendWeights: 1
+ textureQuality: 1
+ anisotropicTextures: 0
+ antiAliasing: 0
+ softParticles: 0
+ softVegetation: 0
+ realtimeReflectionProbes: 0
+ billboardsFaceCameraPosition: 0
+ vSyncCount: 0
+ lodBias: 0.3
+ maximumLODLevel: 0
+ particleRaycastBudget: 4
+ asyncUploadTimeSlice: 2
+ asyncUploadBufferSize: 4
+ resolutionScalingFixedDPIFactor: 1
+ excludedTargetPlatforms: []
+ - serializedVersion: 2
+ name: Low
+ pixelLightCount: 0
+ shadows: 0
+ shadowResolution: 0
+ shadowProjection: 1
+ shadowCascades: 1
+ shadowDistance: 20
+ shadowNearPlaneOffset: 3
+ shadowCascade2Split: 0.33333334
+ shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+ shadowmaskMode: 0
+ blendWeights: 2
+ textureQuality: 0
+ anisotropicTextures: 0
+ antiAliasing: 0
+ softParticles: 0
+ softVegetation: 0
+ realtimeReflectionProbes: 0
+ billboardsFaceCameraPosition: 0
+ vSyncCount: 0
+ lodBias: 0.4
+ maximumLODLevel: 0
+ particleRaycastBudget: 16
+ asyncUploadTimeSlice: 2
+ asyncUploadBufferSize: 4
+ resolutionScalingFixedDPIFactor: 1
+ excludedTargetPlatforms: []
+ - serializedVersion: 2
+ name: Medium
+ pixelLightCount: 1
+ shadows: 1
+ shadowResolution: 0
+ shadowProjection: 1
+ shadowCascades: 1
+ shadowDistance: 20
+ shadowNearPlaneOffset: 3
+ shadowCascade2Split: 0.33333334
+ shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+ shadowmaskMode: 0
+ blendWeights: 2
+ textureQuality: 0
+ anisotropicTextures: 1
+ antiAliasing: 0
+ softParticles: 0
+ softVegetation: 0
+ realtimeReflectionProbes: 0
+ billboardsFaceCameraPosition: 0
+ vSyncCount: 1
+ lodBias: 0.7
+ maximumLODLevel: 0
+ particleRaycastBudget: 64
+ asyncUploadTimeSlice: 2
+ asyncUploadBufferSize: 4
+ resolutionScalingFixedDPIFactor: 1
+ excludedTargetPlatforms: []
+ - serializedVersion: 2
+ name: High
+ pixelLightCount: 2
+ shadows: 2
+ shadowResolution: 1
+ shadowProjection: 1
+ shadowCascades: 2
+ shadowDistance: 40
+ shadowNearPlaneOffset: 3
+ shadowCascade2Split: 0.33333334
+ shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+ shadowmaskMode: 1
+ blendWeights: 2
+ textureQuality: 0
+ anisotropicTextures: 1
+ antiAliasing: 0
+ softParticles: 0
+ softVegetation: 1
+ realtimeReflectionProbes: 1
+ billboardsFaceCameraPosition: 1
+ vSyncCount: 1
+ lodBias: 1
+ maximumLODLevel: 0
+ particleRaycastBudget: 256
+ asyncUploadTimeSlice: 2
+ asyncUploadBufferSize: 4
+ resolutionScalingFixedDPIFactor: 1
+ excludedTargetPlatforms: []
+ - serializedVersion: 2
+ name: Very High
+ pixelLightCount: 3
+ shadows: 2
+ shadowResolution: 2
+ shadowProjection: 1
+ shadowCascades: 2
+ shadowDistance: 70
+ shadowNearPlaneOffset: 3
+ shadowCascade2Split: 0.33333334
+ shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+ shadowmaskMode: 1
+ blendWeights: 4
+ textureQuality: 0
+ anisotropicTextures: 2
+ antiAliasing: 2
+ softParticles: 1
+ softVegetation: 1
+ realtimeReflectionProbes: 1
+ billboardsFaceCameraPosition: 1
+ vSyncCount: 1
+ lodBias: 1.5
+ maximumLODLevel: 0
+ particleRaycastBudget: 1024
+ asyncUploadTimeSlice: 2
+ asyncUploadBufferSize: 4
+ resolutionScalingFixedDPIFactor: 1
+ excludedTargetPlatforms: []
+ - serializedVersion: 2
+ name: Ultra
+ pixelLightCount: 4
+ shadows: 2
+ shadowResolution: 2
+ shadowProjection: 1
+ shadowCascades: 4
+ shadowDistance: 150
+ shadowNearPlaneOffset: 3
+ shadowCascade2Split: 0.33333334
+ shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+ shadowmaskMode: 1
+ blendWeights: 4
+ textureQuality: 0
+ anisotropicTextures: 2
+ antiAliasing: 2
+ softParticles: 1
+ softVegetation: 1
+ realtimeReflectionProbes: 1
+ billboardsFaceCameraPosition: 1
+ vSyncCount: 1
+ lodBias: 2
+ maximumLODLevel: 0
+ particleRaycastBudget: 4096
+ asyncUploadTimeSlice: 2
+ asyncUploadBufferSize: 4
+ resolutionScalingFixedDPIFactor: 1
+ excludedTargetPlatforms: []
+ m_PerPlatformDefaultQuality:
+ Android: 2
+ Nintendo 3DS: 5
+ Nintendo Switch: 5
+ PS4: 5
+ PSM: 5
+ PSP2: 2
+ Standalone: 5
+ Tizen: 2
+ WebGL: 3
+ WiiU: 5
+ Windows Store Apps: 5
+ XboxOne: 5
+ iPhone: 2
+ tvOS: 2
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset
new file mode 100644
index 0000000000..1c92a7840e
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset
@@ -0,0 +1,43 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!78 &1
+TagManager:
+ serializedVersion: 2
+ tags: []
+ layers:
+ - Default
+ - TransparentFX
+ - Ignore Raycast
+ -
+ - Water
+ - UI
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ -
+ m_SortingLayers:
+ - name: Default
+ uniqueID: 0
+ locked: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset
new file mode 100644
index 0000000000..558a017e1f
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset
@@ -0,0 +1,9 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!5 &1
+TimeManager:
+ m_ObjectHideFlags: 0
+ Fixed Timestep: 0.02
+ Maximum Allowed Timestep: 0.33333334
+ m_TimeScale: 1
+ Maximum Particle Timestep: 0.03
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset
new file mode 100644
index 0000000000..3da14d5baf
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset
@@ -0,0 +1,34 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!310 &1
+UnityConnectSettings:
+ m_ObjectHideFlags: 0
+ m_Enabled: 0
+ m_TestMode: 0
+ m_TestEventUrl:
+ m_TestConfigUrl:
+ m_TestInitMode: 0
+ CrashReportingSettings:
+ m_EventUrl: https://perf-events.cloud.unity3d.com/api/events/crashes
+ m_NativeEventUrl: https://perf-events.cloud.unity3d.com/symbolicate
+ m_Enabled: 0
+ m_CaptureEditorExceptions: 1
+ UnityPurchasingSettings:
+ m_Enabled: 0
+ m_TestMode: 0
+ UnityAnalyticsSettings:
+ m_Enabled: 0
+ m_InitializeOnStartup: 1
+ m_TestMode: 0
+ m_TestEventUrl:
+ m_TestConfigUrl:
+ UnityAdsSettings:
+ m_Enabled: 0
+ m_InitializeOnStartup: 1
+ m_TestMode: 0
+ m_IosGameId:
+ m_AndroidGameId:
+ m_GameIds: {}
+ m_GameId:
+ PerformanceReportingSettings:
+ m_Enabled: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md
new file mode 100644
index 0000000000..0b3813fccb
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md
@@ -0,0 +1,24 @@
+# TF Lite Experimental Unity Plugin
+
+This directory contains an experimental sample Unity (2017) Plugin, based on
+the experimental TF Lite C API. The sample demonstrates running inference within
+Unity by way of a C# `Interpreter` wrapper.
+
+Note that the native TF Lite plugin(s) *must* be built before using the Unity
+Plugin, and placed in Assets/TensorFlowLite/SDK/Plugins/. To build the plugin
+for the editor (note that this has only been tested on Linux; the syntax may
+differ on Mac/Windows):
+
+```sh
+bazel build -c opt --cxxopt=--std=c++11 \
+ //tensorflow/contrib/lite/experimental/c:libtensorflowlite_c.so
+```
+
+and for Android:
+
+```sh
+bazel build -c opt --cxxopt=--std=c++11 \
+ --crosstool_top=//external:android/crosstool \
+ --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+ --cpu=armeabi-v7a \
+ //tensorflow/contrib/lite/experimental/c:libtensorflowlite_c.so
+```
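+
+## Example usage
+
+A minimal usage sketch, assuming the native plugin has been placed as above
+and that a float model with one input and one output tensor is available. The
+model file name, tensor shapes, and element counts below are illustrative
+placeholders, not part of the plugin:
+
+```csharp
+using UnityEngine;
+using TensorFlowLite;
+
+public class InferenceExample : MonoBehaviour {
+  void Start() {
+    // Load the model bytes; this path is hypothetical.
+    byte[] modelData = System.IO.File.ReadAllBytes(
+        Application.streamingAssetsPath + "/model.tflite");
+    using (var interpreter = new Interpreter(modelData)) {
+      // Shapes and element counts are placeholders for a real model.
+      interpreter.ResizeInputTensor(0, new int[] {1, 4});
+      interpreter.AllocateTensors();
+      float[] input = {1f, 2f, 3f, 4f};
+      interpreter.SetInputTensorData(0, input);
+      interpreter.Invoke();
+      float[] output = new float[2];
+      interpreter.GetOutputTensorData(0, output);
+      Debug.Log("Output[0]: " + output[0]);
+    }
+  }
+}
+```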
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json
new file mode 100644
index 0000000000..526aca6057
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json
@@ -0,0 +1,4 @@
+{
+ "dependencies": {
+ }
+}
diff --git a/tensorflow/contrib/lite/g3doc/README.md b/tensorflow/contrib/lite/g3doc/README.md
new file mode 100644
index 0000000000..e3db478481
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/README.md
@@ -0,0 +1,4 @@
+This is a *work-in-progress* TF Lite subsite for:
+https://www.tensorflow.org/mobile
+
+DO NOT PUBLISH
diff --git a/tensorflow/contrib/lite/g3doc/_book.yaml b/tensorflow/contrib/lite/g3doc/_book.yaml
new file mode 100644
index 0000000000..98abd5743b
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/_book.yaml
@@ -0,0 +1,58 @@
+upper_tabs:
+# Tabs left of dropdown menu
+- include: /_upper_tabs_left.yaml
+# Dropdown menu
+- name: Ecosystem
+ path: /ecosystem
+ is_default: True
+ menu:
+ - include: /ecosystem/_menu_toc.yaml
+ lower_tabs:
+ # Subsite tabs
+ other:
+ - name: Guide
+ contents:
+ - title: Overview
+ path: /mobile/overview
+ - title: Developer Guide
+ path: /mobile/devguide
+ - title: Android Demo App
+ path: /mobile/demo_android
+ - title: iOS Demo App
+ path: /mobile/demo_ios
+ - title: Performance
+ path: /mobile/performance
+ - break: True
+ - title: TensorFlow Lite APIs
+ path: /mobile/apis
+ - title: Custom operators
+ path: /mobile/custom_operators
+ - title: TensorFlow Lite Ops Versioning
+ path: /mobile/ops_versioning
+ - title: TensorFlow Lite Compatibility Guide
+ path: /mobile/tf_ops_compatibility
+ - title: List of Hosted Models
+ path: /mobile/models
+ - title: TensorFlow Lite for iOS
+ path: /mobile/ios
+ - title: TensorFlow Lite for Raspberry Pi
+ path: /mobile/rpi
+
+ - heading: TF Mobile
+ status: deprecated
+ - title: Overview
+ path: /mobile/tfmobile/
+ - title: Building TensorFlow on Android
+ path: /mobile/tfmobile/android_build
+  - title: Building TensorFlow on iOS
+ path: /mobile/tfmobile/ios_build
+ - title: Integrating TensorFlow libraries
+ path: /mobile/tfmobile/linking_libs
+ - title: Preparing models for mobile deployment
+ path: /mobile/tfmobile/prepare_models
+ - title: Optimizing for mobile
+ path: /mobile/tfmobile/optimizing
+
+ - name: API
+ contents:
+ - include: /mobile/api_docs/python/_toc.yaml
diff --git a/tensorflow/contrib/lite/g3doc/_index.yaml b/tensorflow/contrib/lite/g3doc/_index.yaml
new file mode 100644
index 0000000000..9119e49117
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/_index.yaml
@@ -0,0 +1,67 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+description: <!--no description-->
+landing_page:
+ rows:
+ - heading: TensorFlow Lite is a lightweight solution for mobile and embedded devices.
+ items:
+ - description: >
+ TensorFlow Lite is TensorFlow’s lightweight solution for mobile and
+ embedded devices. It enables on-device machine learning inference with
+ low latency and a small binary size. TensorFlow Lite also supports
+ hardware acceleration with the
+ <a href='https://developer.android.com/ndk/guides/neuralnetworks/index.html'>Android Neural Networks API</a>.
+ list:
+ - heading: Key point 1
+ description: >
+ [high-level overview]
+ icon:
+ icon_name: chevron_right
+ foreground: theme
+ background: grey
+ - heading: Key point 2
+ description: >
+ [high-level overview]
+ icon:
+ icon_name: chevron_right
+ foreground: theme
+ background: grey
+ - heading: Key point 3
+ description: >
+ [high-level overview]
+ icon:
+ icon_name: chevron_right
+ foreground: theme
+ background: grey
+ - code_block: |
+ <pre class = "prettyprint">
+ $ toco --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
+ --input_format=TENSORFLOW_GRAPHDEF \
+ --output_format=TFLITE \
+ --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
+ --inference_type=FLOAT \
+ --input_type=FLOAT \
+ --input_arrays=input \
+ --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+ --input_shapes=1,224,224,3
+ </pre>
+
+ - classname: devsite-landing-row-cards
+ items:
+ - heading: Using TensorFlow Lite on Android
+ image_path: /ecosystem/images/tf-logo-card-16x9.png
+ path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
+ buttons:
+ - label: Read on TensorFlow blog
+ path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
+ - heading: TensorFlow Lite at the Dev Summit
+ youtube_id: FAMfy7izB6A
+ buttons:
+ - label: Watch the video
+ path: https://www.youtube.com/watch?v=FAMfy7izB6A
+ - heading: TensorFlow Lite on GitHub
+ image_path: /ecosystem/images/github-card-16x9.png
+ path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite
+ buttons:
+ - label: View on GitHub
+ path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite
diff --git a/tensorflow/contrib/lite/g3doc/_project.yaml b/tensorflow/contrib/lite/g3doc/_project.yaml
new file mode 100644
index 0000000000..b39666516b
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/_project.yaml
@@ -0,0 +1,10 @@
+name: TensorFlow Lite
+breadcrumb_name: Mobile
+home_url: /mobile/
+parent_project_metadata_path: /_project.yaml
+description: >
+ TensorFlow Lite is a lightweight solution for mobile and embedded devices.
+use_site_branding: True
+hide_from_products_list: True
+content_license: cc3-apache2
+buganizer_id: 316308
diff --git a/tensorflow/contrib/lite/g3doc/api_docs/python/_toc.yaml b/tensorflow/contrib/lite/g3doc/api_docs/python/_toc.yaml
new file mode 100644
index 0000000000..1e1c44c692
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/api_docs/python/_toc.yaml
@@ -0,0 +1,6 @@
+# Automatically generated file; please do not edit
+toc:
+ - title: TensorFlow Lite
+ section:
+ - title: Overview
+ path: /mobile/api_docs/python/
diff --git a/tensorflow/contrib/lite/g3doc/api_docs/python/index.md b/tensorflow/contrib/lite/g3doc/api_docs/python/index.md
new file mode 100644
index 0000000000..70031a3c3d
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/api_docs/python/index.md
@@ -0,0 +1,10 @@
+Project: /mobile/_project.yaml
+Book: /mobile/_book.yaml
+page_type: reference
+<style> table img { max-width: 100%; } </style>
+<script src="/_static/js/managed/mathjax/MathJax.js?config=TeX-AMS-MML_SVG"></script>
+
+<!-- DO NOT EDIT! Automatically generated file. -->
+# All symbols in TensorFlow Lite
+
+TEMP PAGE
diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/contrib/lite/g3doc/apis.md
index a591a353dd..776803da8c 100644
--- a/tensorflow/contrib/lite/g3doc/apis.md
+++ b/tensorflow/contrib/lite/g3doc/apis.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# TensorFlow Lite APIs
TensorFlow Lite provides programming APIs in C++ and Java, and in both cases
@@ -53,6 +56,7 @@ typedef enum {
```
Failures can be easily verified with:
+
```c++
if (status != kTfLiteOk) {
// ... error handling here ...
diff --git a/tensorflow/contrib/lite/g3doc/benchmarks.md b/tensorflow/contrib/lite/g3doc/benchmarks.md
deleted file mode 100644
index 96536cba27..0000000000
--- a/tensorflow/contrib/lite/g3doc/benchmarks.md
+++ /dev/null
@@ -1,178 +0,0 @@
-# Performance Benchmark numbers
-
-This document contains the performance benchmark numbers for running a few well
-known models on some Android and iOS devices.
-
-The benchmark numbers were generated by running the [TFLite benchmark
-binary](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark)
-on Android and running the [iOS benchmark
-app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark/ios)
-on iOS.
-
-# Android benchmarks
-
-When running Android benchmarks, the CPU affinity is set to use big cores on the
-device to reduce variance (see
-[details](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark#reducing-variance-between-runs-on-android)).
-
-Models are assumed to have been downloaded from the link, unzipped and pushed to
-`/data/local/tmp/tflite_models` folder. The benchmark binary is built according
-to instructions listed
-[here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark#on-android)
-and is assumed to have been pushed to `/data/local/tmp`.
-
-The following command was used to run the benchmark:
-
-```
-adb shell taskset ${CPU_MASK} /data/local/tmp/benchmark_model \
- --num_threads=1 \
- --graph=/data/local/tmp/tflite_models/${GRAPH} \
- --warmup_runs=1 \
- --num_runs=50 \
- --use_nnapi=false
-```
-
-where `${GRAPH}` is the name of model and `${CPU_MASK}` is the CPU affinity
-chosen according to the following table:
-
-Device | CPU_MASK |
--------| ----------
-Pixel 2 | f0 |
-Pixel xl | 0c |
-
-
-<table>
- <thead>
- <tr>
- <th>Model Name</th>
- <th>Device </th>
- <th>Mean inference time (std dev)</th>
- </tr>
- </thead>
- <tr>
- <td rowspan = 2>
- <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
- </td>
- <td>Pixel 2 </td>
- <td>166.5 ms (2.6 ms)</td>
- </tr>
- <tr>
- <td>Pixel xl </td>
- <td>122.9 ms (1.8 ms) </td>
- </tr>
- <tr>
- <td rowspan = 2>
- <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz">Mobilenet_1.0_224 (quant)</a>
- </td>
- <td>Pixel 2 </td>
- <td>69.5 ms (0.9 ms)</td>
- </tr>
- <tr>
- <td>Pixel xl </td>
- <td>78.9 ms (2.2 ms) </td>
- </tr>
- <tr>
- <td rowspan = 2>
- <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
- </td>
- <td>Pixel 2 </td>
- <td>273.8 ms (3.5 ms)</td>
- </tr>
- <tr>
- <td>Pixel xl </td>
- <td>210.8 ms (4.2 ms)</td>
- </tr>
- <tr>
- <td rowspan = 2>
- <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
- </td>
- <td>Pixel 2 </td>
- <td>234.0 ms (2.1 ms)</td>
- </tr>
- <tr>
- <td>Pixel xl </td>
- <td>158.0 ms (2.1 ms)</td>
- </tr>
- <tr>
- <td rowspan = 2>
- <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
- </td>
- <td>Pixel 2 </td>
- <td>2846.0 ms (15.0 ms)</td>
- </tr>
- <tr>
- <td>Pixel xl </td>
- <td>1973.0 ms (15.0 ms) </td>
- </tr>
- <tr>
- <td rowspan = 2>
- <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
- </td>
- <td>Pixel 2 </td>
- <td>3180.0 ms (11.7 ms)</td>
- </tr>
- <tr>
- <td>Pixel xl </td>
- <td>2262.0 ms (21.0 ms) </td>
- </tr>
-
- </table>
-
-# iOS benchmarks
-
-For running iOS benchmarks, the [benchmark
-app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark/ios)
-was modified to include the appropriate model and `benchmark_params.json` was
-modified to set `num_threads` to 1.
-
-<table>
- <thead>
- <tr>
- <th>Model Name</th>
- <th>Device </th>
- <th>Mean inference time (std dev)</th>
- </tr>
- </thead>
- <tr>
- <td>
- <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
- </td>
- <td>iPhone 8 </td>
- <td>32.2 ms (0.8 ms)</td>
- </tr>
- <tr>
- <td>
- <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz)">Mobilenet_1.0_224 (quant)</a>
- </td>
- <td>iPhone 8 </td>
- <td>24.4 ms (0.8 ms)</td>
- </tr>
- <tr>
- <td>
- <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
- </td>
- <td>iPhone 8 </td>
- <td>60.3 ms (0.6 ms)</td>
- </tr>
- <tr>
- <td>
- <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
- </td>
- <td>iPhone 8 </td>
- <td>44.3 (0.7 ms)</td>
- </tr>
- <tr>
- <td>
- <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
- </td>
- <td>iPhone 8</td>
- <td>562.4 ms (18.2 ms)</td>
- </tr>
- <tr>
- <td>
- <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
- </td>
- <td>iPhone 8 </td>
- <td>661.0 ms (29.2 ms)</td>
- </tr>
- </table>
diff --git a/tensorflow/contrib/lite/g3doc/custom_operators.md b/tensorflow/contrib/lite/g3doc/custom_operators.md
index 972e57f73e..d979353bb3 100644
--- a/tensorflow/contrib/lite/g3doc/custom_operators.md
+++ b/tensorflow/contrib/lite/g3doc/custom_operators.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# How to use custom operators
TensorFlow Lite currently supports a subset of TensorFlow operators. However, it
@@ -89,3 +92,83 @@ builtins.AddCustom("Sin", Register_SIN());
Note that a similar process as above can be followed to support a set of
operations instead of a single operator.
+
+## Best Practices for writing custom operators
+
+1. Optimize memory allocations and de-allocations cautiously. It is more
+   efficient to allocate memory in Prepare() instead of Invoke(), and to
+   allocate memory before a loop rather than in every iteration. Use temporary
+   tensors rather than calling malloc yourself (see item 2), and prefer
+   pointers or references over copying wherever possible.
+
+2. If a data structure will persist for the entire operation, we advise
+   pre-allocating its memory using temporary tensors. You may need to use an
+   OpData struct to reference the tensor indices in other functions. See the
+   example in the
+   [kernel for convolution](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/conv.cc).
+   A sample code snippet is below:
+
+    ```c++
+ auto* op_data = reinterpret_cast<OpData*>(node->user_data);
+ TfLiteIntArrayFree(node->temporaries);
+ node->temporaries = TfLiteIntArrayCreate(1);
+ node->temporaries->data[0] = op_data->temp_tensor_index;
+ TfLiteTensor* temp_tensor = &context->tensors[op_data->temp_tensor_index];
+ temp_tensor->type = kTfLiteFloat32;
+ temp_tensor->allocation_type = kTfLiteArenaRw;
+ ```
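+
+    Setting the allocation type to `kTfLiteArenaRw` hands the buffer to
+    TensorFlow Lite's arena planner, which can reuse the same memory across
+    operations instead of keeping a separate allocation alive for each op.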
+
+3. If it doesn't cost too much wasted memory, prefer a static fixed-size array
+   (or a std::vector pre-allocated in Resize()) over a std::vector that is
+   dynamically allocated on every iteration of execution.
+
+4. Avoid instantiating standard library container templates that aren't
+   already used elsewhere, because each new instantiation increases binary
+   size. For example, if you need a std::map in your operation that doesn't
+   exist in other kernels, a std::vector with direct index mapping could work
+   while keeping the binary size small. See what other kernels use to gain
+   insight (or ask).
+
+5. Check the pointer returned by malloc. If the pointer is nullptr, perform no
+   operations through it. If you malloc() in a function and take an error
+   exit, deallocate the memory before you exit.
+
+6. Use TF_LITE_ENSURE(context, condition) to check for a specific condition.
+   Your code must not leave memory hanging when a TF_LITE_ENSURE check fails;
+   perform these checks before allocating any resources that could leak (see
+   the sketch after this list).
+
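+A minimal sketch of items 5 and 6, assuming a hypothetical `Prepare()` with a
+made-up `kScratchBytes` constant and `OpData::scratch` field; the point is
+that the `TF_LITE_ENSURE` checks run before anything is allocated, so an error
+exit cannot leak:
+
+```c++
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Validate first: TF_LITE_ENSURE_EQ returns kTfLiteError on failure, and
+  // nothing has been allocated at this point, so nothing can leak (item 6).
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  // Allocate only after every check has passed.
+  char* scratch = reinterpret_cast<char*>(malloc(kScratchBytes));
+  // Item 5: check the pointer returned by malloc before using it.
+  if (scratch == nullptr) return kTfLiteError;
+  reinterpret_cast<OpData*>(node->user_data)->scratch = scratch;
+  return kTfLiteOk;
+}
+```
+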
+## Special TF Graph Attributes
+
+When Toco converts a TF graph into the TFLite format, it makes some
+assumptions about custom operations that might not be correct. In this case,
+the generated graph may not be executable.
+
+It is possible to add additional information about your custom op output to the
+TF graph before it is converted. The following attributes are supported:
+
+- **_output_quantized** a boolean attribute, true if the operation outputs are
+ quantized
+- **_output_types** a list of types for output tensors
+- **_output_shapes** a list of shapes for output tensors
+
+### Setting the Attributes
+
+This is an example of how the attributes can be set:
+
+```python
+from tensorflow.core.framework import types_pb2
+
+frozen_graph_def = tf.graph_util.convert_variables_to_constants(...)
+for node in frozen_graph_def.node:
+  if node.op == 'sin':
+    node.attr['_output_types'].list.type.extend([
+        types_pb2.DT_FLOAT,
+    ])
+    node.attr['_output_shapes'].list.shape.extend([
+        tf.TensorShape([10]).as_proto(),
+    ])
+    node.attr['_output_quantized'].b = False
+tflite_model = tf.contrib.lite.toco_convert(
+    frozen_graph_def, ...)
+```
+
+**Note:** After the attributes are set, the graph cannot be executed by
+TensorFlow, so set the attributes just before the conversion.
diff --git a/tensorflow/docs_src/mobile/tflite/demo_android.md b/tensorflow/contrib/lite/g3doc/demo_android.md
index fdf0bcf3c1..d79a2696b4 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_android.md
+++ b/tensorflow/contrib/lite/g3doc/demo_android.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# Android Demo App
An example Android application using TensorFlow Lite is available
diff --git a/tensorflow/docs_src/mobile/tflite/demo_ios.md b/tensorflow/contrib/lite/g3doc/demo_ios.md
index 3be21da89f..a554898899 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_ios.md
+++ b/tensorflow/contrib/lite/g3doc/demo_ios.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# iOS Demo App
The TensorFlow Lite demo is a camera app that continuously classifies whatever
diff --git a/tensorflow/docs_src/mobile/tflite/devguide.md b/tensorflow/contrib/lite/g3doc/devguide.md
index b168d6c183..dc9cc98c08 100644
--- a/tensorflow/docs_src/mobile/tflite/devguide.md
+++ b/tensorflow/contrib/lite/g3doc/devguide.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# Developer Guide
Using a TensorFlow Lite model in your mobile app requires multiple
@@ -56,7 +59,7 @@ both floating point and quantized inference.
A developer may choose to train a custom model using Tensorflow (see the
[TensorFlow tutorials](../../tutorials/) for examples of building and training
models). If you have already written a model, the first step is to export this
-to a @{tf.GraphDef} file. This is required because some formats do not store the
+to a `tf.GraphDef` file. This is required because some formats do not store the
model structure outside the code, and we must communicate with other parts of the
framework. See
[Exporting the Inference Graph](https://github.com/tensorflow/models/blob/master/research/slim/README.md)
@@ -71,12 +74,12 @@ grow in future Tensorflow Lite releases.
## 2. Convert the model format
The model generated (or downloaded) in the previous step is a *standard*
-Tensorflow model and you should now have a .pb or .pbtxt @{tf.GraphDef} file.
+Tensorflow model and you should now have a .pb or .pbtxt `tf.GraphDef` file.
Models generated with transfer learning (re-training) or custom models must be
converted—but, we must first freeze the graph to convert the model to the
Tensorflow Lite format. This process uses several model formats:
-* @{tf.GraphDef} (.pb) —A protobuf that represents the TensorFlow training or
+* `tf.GraphDef` (.pb) —A protobuf that represents the TensorFlow training or
computation graph. It contains operators, tensors, and variables definitions.
* *CheckPoint* (.ckpt) —Serialized variables from a TensorFlow graph. Since this
does not contain a graph structure, it cannot be interpreted by itself.
@@ -143,11 +146,11 @@ containing the model architecture. The [frozen_graph.pb](https://storage.googlea
file used here is available for download. `output_file` is where the TensorFlow
Lite model will get generated. The `input_type` and `inference_type`
arguments should be set to `FLOAT`, unless converting a
-@{$performance/quantization$quantized model}. Setting the `input_array`,
-`output_array`, and `input_shape` arguments are not as straightforward. The
-easiest way to find these values is to explore the graph using Tensorboard. Reuse
-the arguments for specifying the output nodes for inference in the
-`freeze_graph` step.
+<a href="https://www.tensorflow.org/performance/quantization">quantized model</a>.
+Setting the `input_array`, `output_array`, and `input_shape` arguments is not as
+straightforward. The easiest way to find these values is to explore the graph
+using TensorBoard. Reuse the arguments for specifying the output nodes for
+inference in the `freeze_graph` step.
It is also possible to use the Tensorflow Optimizing Converter with protobufs
from either Python or from the command line (see the
@@ -204,16 +207,16 @@ The open source Android demo app uses the JNI interface and is available
[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
You can also download a
[prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-See the @{$tflite/demo_android} guide for details.
+See the <a href="../demo_android.md">Android demo</a> guide for details.
-The @{$mobile/android_build} guide has instructions for installing TensorFlow on
-Android and setting up `bazel` and Android Studio.
+The <a href="./android_build.md">Android mobile</a> guide has instructions for
+installing TensorFlow on Android and setting up `bazel` and Android Studio.
### iOS
To integrate a TensorFlow model in an iOS app, see the
[TensorFlow Lite for iOS](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md)
-guide and @{$tflite/demo_ios} guide.
+guide and <a href="../demo_ios.md">iOS demo</a> guide.
#### Core ML support
diff --git a/tensorflow/contrib/lite/g3doc/ios.md b/tensorflow/contrib/lite/g3doc/ios.md
index e0358a444d..d78d373ccf 100644
--- a/tensorflow/contrib/lite/g3doc/ios.md
+++ b/tensorflow/contrib/lite/g3doc/ios.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# TensorFlow Lite for iOS
## Building
diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/contrib/lite/g3doc/models.md
index 4e7d33a1b6..3292aece0e 100644
--- a/tensorflow/contrib/lite/g3doc/models.md
+++ b/tensorflow/contrib/lite/g3doc/models.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# List of Hosted Models
## Image classification (Float Models)
diff --git a/tensorflow/contrib/lite/g3doc/ops_versioning.md b/tensorflow/contrib/lite/g3doc/ops_versioning.md
index bd2f797e6c..b06f4fd3b8 100644
--- a/tensorflow/contrib/lite/g3doc/ops_versioning.md
+++ b/tensorflow/contrib/lite/g3doc/ops_versioning.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# TensorFlow Lite Ops Versioning
This document describes TensorFlow Lite's op versioning schema. Op
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/contrib/lite/g3doc/overview.md
index cc4af2a875..be60d7941a 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/contrib/lite/g3doc/overview.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# Introduction to TensorFlow Lite
TensorFlow Lite is TensorFlow’s lightweight solution for mobile and embedded
diff --git a/tensorflow/docs_src/mobile/tflite/performance.md b/tensorflow/contrib/lite/g3doc/performance.md
index 79bacaaa1b..613e9f97c3 100644
--- a/tensorflow/docs_src/mobile/tflite/performance.md
+++ b/tensorflow/contrib/lite/g3doc/performance.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# Performance
This document lists TensorFlow Lite performance benchmarks when running well
diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/contrib/lite/g3doc/rpi.md
index ab50789307..cdc9172d87 100644
--- a/tensorflow/contrib/lite/g3doc/rpi.md
+++ b/tensorflow/contrib/lite/g3doc/rpi.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# TensorFlow Lite for Raspberry Pi
## Cross compiling
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 49d00a66ba..aa65ec9988 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# TensorFlow Lite & TensorFlow Compatibility Guide
TensorFlow Lite supports a number of TensorFlow operations used in common
@@ -59,6 +62,7 @@ counterparts:
* [tf.nn.softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) -
*as long as tensors are 2D and axis is the last dimension*
* [tf.nn.top_k](https://www.tensorflow.org/api_docs/python/tf/nn/top_k)
+* [tf.one_hot](https://www.tensorflow.org/api_docs/python/tf/one_hot)
* [tf.pad](https://www.tensorflow.org/api_docs/python/tf/pad) - *as long as
mode and constant_values are not used*
* [tf.reduce_mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean) -
@@ -815,6 +819,30 @@ Outputs {
}
```
+**PACK**
+
+```
+Inputs {
+ 0: a list of tensors.
+ 1: an integer.
+}
+Outputs {
+  0: a tensor with the input tensors stacked along a new dimension.
+}
+```
+
+**LOGICAL_OR**
+
+```
+Inputs {
+  0: a tensor of booleans.
+  1: a tensor of booleans.
+}
+Outputs {
+  0: the elementwise logical OR of the two input tensors.
+}
+```
+
And these are TensorFlow Lite operations that are present but not ready for
custom models yet:
diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
index f4b07db459..76e16fc9db 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# Building TensorFlow on Android
To get you started working with TensorFlow on Android, we'll walk through two
@@ -91,7 +94,8 @@ using [ADB](https://developer.android.com/studio/command-line/adb.html). This
requires some knowledge of build systems and Android developer tools, but we'll
guide you through the basics here.
-- First, follow our instructions for @{$install/install_sources$installing from sources}.
+- First, follow our instructions for
+ <a href="http://www.tensorflow.org/install/install_sources">installing from sources</a>.
This will also guide you through installing Bazel and cloning the
TensorFlow code.
diff --git a/tensorflow/docs_src/mobile/mobile_intro.md b/tensorflow/contrib/lite/g3doc/tfmobile/index.md
index baad443308..bd047bfcec 100644
--- a/tensorflow/docs_src/mobile/mobile_intro.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/index.md
@@ -1,4 +1,45 @@
-# Introduction to TensorFlow Mobile
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
+# Overview
+
+TensorFlow was designed to be a good deep learning solution for mobile
+platforms. Currently we have two solutions for deploying machine learning
+applications on mobile and embedded devices: TensorFlow for Mobile and
+<a href="../index.md">TensorFlow Lite</a>.
+
+## TensorFlow Lite versus TensorFlow Mobile
+
+Here are a few of the differences between the two:
+
+- TensorFlow Lite is an evolution of TensorFlow Mobile. In most cases, apps
+ developed with TensorFlow Lite will have a smaller binary size, fewer
+ dependencies, and better performance.
+
+- TensorFlow Lite is in developer preview, so not all use cases are covered yet.
+  For production cases, we recommend TensorFlow Mobile.
+
+- TensorFlow Lite supports only a limited set of operators, so not all models
+ will work on it by default. TensorFlow for Mobile has a fuller set of
+ supported functionality.
+
+TensorFlow Lite provides better performance and a smaller binary size on mobile
+platforms, as well as the ability to leverage hardware acceleration when the
+platform makes it available. In addition, it has far fewer dependencies, so it
+can be built and run on simpler, more constrained devices. TensorFlow Lite
+also allows targeting accelerators through the [Neural Networks
+API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
+
+TensorFlow Lite currently covers only a limited set of operators. TensorFlow
+for Mobile also supports only a constrained set of ops by default, but it can
+in principle be built with the kernel for any operator used in a TensorFlow
+graph. Use cases that TensorFlow Lite does not yet support should therefore
+continue to use TensorFlow for Mobile. As TensorFlow Lite gains additional
+operators over time, the decision will become easier to make.
+
+
+## Introduction to TensorFlow Mobile
TensorFlow was designed from the ground up to be a good deep learning solution
for mobile platforms like Android and iOS. This mobile guide should help you
@@ -167,7 +208,7 @@ interesting products possible.
TensorFlow runs on Ubuntu Linux, Windows 10, and OS X. For a list of all
supported operating systems and instructions to install TensorFlow, see
-@{$install$Installing Tensorflow}.
+<a href="https://www.tensorflow.org/install">Installing Tensorflow</a>.
Note that some of the sample code we provide for mobile TensorFlow requires you
to compile TensorFlow from source, so you’ll need more than just `pip install`
@@ -241,8 +282,3 @@ results you’ll see. It’s common for an algorithm to get great training accur
numbers but then fail to be useful within a real application because there’s a
mismatch between the dataset and real usage. Prototype end-to-end usage as soon
as possible to create a consistent user experience.
-
-## Next Steps
-
-We suggest you get started by building one of our demos for
-@{$mobile/android_build$Android} or @{$mobile/ios_build$iOS}.
diff --git a/tensorflow/docs_src/mobile/ios_build.md b/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
index 4c84a1214a..6223707892 100644
--- a/tensorflow/docs_src/mobile/ios_build.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# Building TensorFlow on iOS
## Using CocoaPods
diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
index efef5dd0da..4c2071ed05 100644
--- a/tensorflow/docs_src/mobile/linking_libs.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# Integrating TensorFlow libraries
Once you have made some progress on a model that addresses the problem you’re
@@ -14,11 +17,11 @@ TensorFlow mobile demo apps.
After you've managed to build the examples, you'll probably want to call
TensorFlow from one of your existing applications. The very easiest way to do
-this is to use the Pod installation steps described
-@{$mobile/ios_build#using_cocoapods$here}, but if you want to build TensorFlow
-from source (for example to customize which operators are included) you'll need
-to break out TensorFlow as a framework, include the right header files, and link
-against the built libraries and dependencies.
+this is to use the Pod installation steps described in
+<a href="./ios_build.md">Building TensorFlow on iOS</a>, but if you want to build
+TensorFlow from source (for example to customize which operators are included)
+you'll need to break out TensorFlow as a framework, include the right header
+files, and link against the built libraries and dependencies.
### Android
@@ -82,10 +85,12 @@ recompile of the core.
To achieve this capability, TensorFlow uses a registration pattern in a lot of
places. In the code, it looks like this:
- class MulKernel : OpKernel {
- Status Compute(OpKernelContext* context) { … }
- };
- REGISTER_KERNEL(MulKernel, “Mul”);
+```c++
+class MulKernel : OpKernel {
+  Status Compute(OpKernelContext* context) { … }
+};
+REGISTER_KERNEL(MulKernel, "Mul");
+```
This would be in a standalone `.cc` file linked into your application, either
as part of the main set of kernels or as a separate custom library. The magic
@@ -101,15 +106,17 @@ doesn’t offer a good mechanism for doing this sort of registration, so we have
to resort to some tricky code. Under the hood, the macro is implemented so that
it produces something like this:
- class RegisterMul {
- public:
- RegisterMul() {
- global_kernel_registry()->Register(“Mul”, [](){
- return new MulKernel()
- });
- }
- };
- RegisterMul g_register_mul;
+```c++
+class RegisterMul {
+ public:
+  RegisterMul() {
+    global_kernel_registry()->Register("Mul", []() {
+      return new MulKernel();
+    });
+  }
+};
+RegisterMul g_register_mul;
+```
This sets up a class `RegisterMul` with a constructor that tells the global
kernel registry what function to call when somebody asks it how to create a
@@ -176,8 +183,10 @@ have an experimental script at [rename_protobuf.sh](https://github.com/tensorflo
You need to run this as part of the makefile build, after you’ve downloaded all
the dependencies:
- tensorflow/contrib/makefile/download_dependencies.sh
- tensorflow/contrib/makefile/rename_protobuf.sh
+```
+tensorflow/contrib/makefile/download_dependencies.sh
+tensorflow/contrib/makefile/rename_protobuf.sh
+```
## Calling the TensorFlow API
@@ -193,18 +202,20 @@ use case, while on iOS and Raspberry Pi you call directly into the C++ API.
Here’s what a typical Inference Library sequence looks like on Android:
- // Load the model from disk.
- TensorFlowInferenceInterface inferenceInterface =
- new TensorFlowInferenceInterface(assetManager, modelFilename);
+```
+// Load the model from disk.
+TensorFlowInferenceInterface inferenceInterface =
+    new TensorFlowInferenceInterface(assetManager, modelFilename);
- // Copy the input data into TensorFlow.
- inferenceInterface.feed(inputName, floatValues, 1, inputSize, inputSize, 3);
+// Copy the input data into TensorFlow.
+inferenceInterface.feed(inputName, floatValues, 1, inputSize, inputSize, 3);
- // Run the inference call.
- inferenceInterface.run(outputNames, logStats);
+// Run the inference call.
+inferenceInterface.run(outputNames, logStats);
- // Copy the output Tensor back into the output array.
- inferenceInterface.fetch(outputName, outputs);
+// Copy the output Tensor back into the output array.
+inferenceInterface.fetch(outputName, outputs);
+```
You can find the source of this code in the [Android examples](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java#L107).
@@ -212,27 +223,29 @@ You can find the source of this code in the [Android examples](https://github.co
Here’s the equivalent code for iOS and Raspberry Pi:
- // Load the model.
- PortableReadFileToProto(file_path, &tensorflow_graph);
-
- // Create a session from the model.
- tensorflow::Status s = session->Create(tensorflow_graph);
- if (!s.ok()) {
- LOG(FATAL) << "Could not create TensorFlow Graph: " << s;
- }
-
- // Run the model.
- std::string input_layer = "input";
- std::string output_layer = "output";
- std::vector<tensorflow::Tensor> outputs;
- tensorflow::Status run_status = session->Run({{input_layer, image_tensor}},
+```
+// Load the model.
+PortableReadFileToProto(file_path, &tensorflow_graph);
+
+// Create a session from the model.
+tensorflow::Status s = session->Create(tensorflow_graph);
+if (!s.ok()) {
+ LOG(FATAL) << "Could not create TensorFlow Graph: " << s;
+}
+
+// Run the model.
+std::string input_layer = "input";
+std::string output_layer = "output";
+std::vector<tensorflow::Tensor> outputs;
+tensorflow::Status run_status = session->Run({{input_layer, image_tensor}},
{output_layer}, {}, &outputs);
- if (!run_status.ok()) {
- LOG(FATAL) << "Running model failed: " << run_status;
- }
+if (!run_status.ok()) {
+ LOG(FATAL) << "Running model failed: " << run_status;
+}
- // Access the output data.
- tensorflow::Tensor* output = &outputs[0];
+// Access the output data.
+tensorflow::Tensor* output = &outputs[0];
+```
This is all based on the
[iOS sample code](https://www.tensorflow.org/code/tensorflow/examples/ios/simple/RunModelViewController.mm),
diff --git a/tensorflow/docs_src/mobile/optimizing.md b/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
index 778e4d3a62..a0192c3541 100644
--- a/tensorflow/docs_src/mobile/optimizing.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# Optimizing for mobile
There are some special issues that you have to deal with when you’re trying to
@@ -77,7 +80,7 @@ out of a mobile device's memory faster.
To understand how large your network will be on disk, start by looking at the
size on disk of your `GraphDef` file after you’ve run `freeze_graph` and
-`strip_unused_nodes` on it (see @{$mobile/prepare_models$Preparing models} for
+`strip_unused_nodes` on it (see <a href="./prepare_models.md">Preparing models</a> for
more details on these tools), since then it should only contain
inference-related nodes. To double-check that your results are as expected, run
the `summarize_graph` tool to see how many parameters are in constants:
@@ -103,7 +106,8 @@ you multiply the number of const parameters by four, you should get something
that’s close to the size of the file on disk. You can often get away with only
eight-bits per parameter with very little loss of accuracy in the final result,
so if your file size is too large you can try using
-@{$performance/quantization$quantize_weights} to transform the parameters down.
+<a href="https://www.tensorflow.org/performance/quantization">quantize_weights</a>
+to transform the parameters down.
bazel build tensorflow/tools/graph_transforms:transform_graph && \
bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
@@ -292,7 +296,8 @@ run it on a 64-bit ARM device:
You can interpret the results in exactly the same way as the desktop version
above. If you have any trouble figuring out what the right input and output
-names and types are, take a look at the @{$mobile/prepare_models$Preparing models}
+names and types are, take a look at the
+<a href="./prepare_models.md">Preparing models</a>
page for details about detecting these for your model, and look at the
`summarize_graph` tool which may give you
helpful information.
diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
index 2b84dbb973..6b4e4a92bd 100644
--- a/tensorflow/docs_src/mobile/prepare_models.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
# Preparing models for mobile deployment
The requirements for storing model information during training are very
@@ -255,8 +258,8 @@ The criteria for including ops and types fall into several categories:
These ops are trimmed by default to optimize for inference on mobile, but it is
possible to alter some build files to change the default. After altering the
build files, you will need to recompile TensorFlow. See below for more details
-on how to do this, and also see @{$mobile/optimizing#binary_size$Optimizing} for
-more on reducing your binary size.
+on how to do this, and also see <a href="./optimizing.md">optimizing binary size</a>
+for more on reducing your binary size.
### Locate the implementation
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 26fecceab0..e38597495d 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -43,10 +43,13 @@ namespace {
TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node,
const TfLiteRegistration& registration,
int node_index, const char* message) {
- context->ReportError(context, "Node number %d (%s) %s.\n", node_index,
- EnumNameBuiltinOperator(static_cast<BuiltinOperator>(
- registration.builtin_code)),
- message);
+ context->ReportError(
+ context, "Node number %d (%s) %s.\n", node_index,
+ registration.custom_name
+ ? registration.custom_name
+ : EnumNameBuiltinOperator(
+ static_cast<BuiltinOperator>(registration.builtin_code)),
+ message);
return kTfLiteError;
}
@@ -131,9 +134,7 @@ Interpreter::Interpreter(ErrorReporter* error_reporter)
context_.SetExternalContext = SetExternalContext;
// Invalid to call these except from TfLiteDelegate
- SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
- SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
- SetForbiddenContextFunction(&context_.GetExecutionPlan);
+ SwitchToKernelContext();
// Reserve some space for the tensors to avoid excessive resizing.
tensors_.reserve(kTensorsReservedCapacity);
@@ -278,8 +279,9 @@ TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
int node_index;
TfLiteDelegateParams* params = CreateDelegateParams(delegate, subgraph);
- AddNodeWithParameters(subgraph.input_tensors, subgraph.output_tensors,
- nullptr, 0, params, &registration, &node_index);
+ TF_LITE_ENSURE_STATUS(AddNodeWithParameters(
+ subgraph.input_tensors, subgraph.output_tensors, nullptr, 0, params,
+ &registration, &node_index));
// Initialize the output tensors' delegate-related fields.
for (int tensor_index : subgraph.output_tensors) {
@@ -924,6 +926,19 @@ void Interpreter::SetNumThreads(int num_threads) {
}
}
+void Interpreter::SwitchToDelegateContext() {
+ context_.GetNodeAndRegistration = GetNodeAndRegistration;
+ context_.ReplaceSubgraphsWithDelegateKernels =
+ ReplaceSubgraphsWithDelegateKernels;
+ context_.GetExecutionPlan = GetExecutionPlan;
+}
+
+void Interpreter::SwitchToKernelContext() {
+ SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
+ SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
+ SetForbiddenContextFunction(&context_.GetExecutionPlan);
+}
+
TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate,
bool allow_dynamic_tensors) {
if (!allow_dynamic_tensors) {
@@ -950,17 +965,12 @@ TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate,
// TODO(aselle): Consider if it is worth storing pointers to delegates.
// Setup additional context interface.
- context_.GetNodeAndRegistration = GetNodeAndRegistration;
- context_.ReplaceSubgraphsWithDelegateKernels =
- ReplaceSubgraphsWithDelegateKernels;
- context_.GetExecutionPlan = GetExecutionPlan;
+ SwitchToDelegateContext();
TfLiteStatus status = delegate->Prepare(&context_, delegate);
// Remove additional context info.
- SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
- SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
- SetForbiddenContextFunction(&context_.GetExecutionPlan);
+ SwitchToKernelContext();
TF_LITE_ENSURE_OK(&context_, status);
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index bc608e2fce..be149a8cc0 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -111,7 +111,7 @@ class Interpreter {
// processing this model will be forwarded to the error_reporter object.
//
// Note, if error_reporter is nullptr, then a default StderrReporter is
- // used.
+ // used. Ownership of 'error_reporter' remains with the caller.
explicit Interpreter(ErrorReporter* error_reporter = DefaultErrorReporter());
~Interpreter();
@@ -416,6 +416,13 @@ class Interpreter {
private:
friend class InterpreterTest;
+ // Prevent 'context_' from accessing functions that are only available to
+ // delegated kernels.
+ void SwitchToKernelContext();
+
+ // Add delegate-only functions to 'context_'.
+ void SwitchToDelegateContext();
+
// Give 'op_reg' a chance to initialize itself using the contents of
// 'buffer'.
void* OpInit(const TfLiteRegistration& op_reg, const char* buffer,
@@ -502,6 +509,7 @@ class Interpreter {
// Update the execution graph to replace some of the nodes with stub
// nodes. Specifically any node index that has `nodes[index]==1` will be
// slated for replacement with a delegate kernel specified by registration.
+ // Ownership of 'nodes_to_replace' and 'delegate' remains with the caller.
// WARNING: This is an experimental interface that is subject to change.
TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index 10119903fe..2bf598bad7 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -647,18 +647,6 @@ TEST(BasicInterpreter, AllocateTwice) {
ASSERT_EQ(old_tensor1_ptr, interpreter.tensor(1)->data.raw);
}
-struct TestErrorReporter : public ErrorReporter {
- int Report(const char* format, va_list args) override {
- char buffer[1024];
- int size = vsnprintf(buffer, sizeof(buffer), format, args);
- all_reports += buffer;
- calls++;
- return size;
- }
- int calls = 0;
- std::string all_reports;
-};
-
TEST(BasicInterpreter, TestNullErrorReporter) {
TestErrorReporter reporter;
Interpreter interpreter;
@@ -668,8 +656,9 @@ TEST(BasicInterpreter, TestCustomErrorReporter) {
TestErrorReporter reporter;
Interpreter interpreter(&reporter);
ASSERT_NE(interpreter.Invoke(), kTfLiteOk);
- ASSERT_EQ(reporter.all_reports, "Invoke called on model that is not ready.");
- ASSERT_EQ(reporter.calls, 1);
+ ASSERT_EQ(reporter.error_messages(),
+ "Invoke called on model that is not ready.");
+ ASSERT_EQ(reporter.num_calls(), 1);
}
TEST(BasicInterpreter, TestUnsupportedDelegateFunctions) {
diff --git a/tensorflow/contrib/lite/java/AndroidManifest.xml b/tensorflow/contrib/lite/java/AndroidManifest.xml
index c3849e6868..b91c6d149a 100644
--- a/tensorflow/contrib/lite/java/AndroidManifest.xml
+++ b/tensorflow/contrib/lite/java/AndroidManifest.xml
@@ -7,5 +7,6 @@
android:targetSdkVersion="19" />
<application />
-
+
</manifest>
+
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index e2c1edd9af..fdcf00a0a0 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -152,10 +152,11 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_allocateTensors(
if (error_reporter == nullptr) return;
if (interpreter->AllocateTensors() != kTfLiteOk) {
- throwException(env, kNullPointerException,
- "Internal error: Cannot allocate memory for the interpreter:"
- " %s",
- error_reporter->CachedErrorMessage());
+ throwException(
+ env, kIllegalStateException,
+ "Internal error: Unexpected failure when preparing tensor allocations:"
+ " %s",
+ error_reporter->CachedErrorMessage());
}
}
@@ -336,10 +337,11 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
// allocates memory
status = interpreter->AllocateTensors();
if (status != kTfLiteOk) {
- throwException(env, kNullPointerException,
- "Internal error: Cannot allocate memory for the interpreter:"
- " %s",
- error_reporter->CachedErrorMessage());
+ throwException(
+ env, kIllegalStateException,
+ "Internal error: Unexpected failure when preparing tensor allocations:"
+ " %s",
+ error_reporter->CachedErrorMessage());
return 0;
}
return reinterpret_cast<jlong>(interpreter.release());
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index ad30624f40..329c98f91e 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -58,6 +58,7 @@ cc_library(
}),
deps = [
":op_macros",
+ "//tensorflow/contrib/lite:arena_planner",
"//tensorflow/contrib/lite:context",
"//tensorflow/contrib/lite/kernels/internal:optimized",
],
@@ -169,12 +170,15 @@ cc_library(
"hashtable_lookup.cc",
"l2norm.cc",
"local_response_norm.cc",
+ "logical.cc",
"lsh_projection.cc",
"lstm.cc",
"maximum_minimum.cc",
"mfcc.cc",
"mul.cc",
"neg.cc",
+ "one_hot.cc",
+ "pack.cc",
"pad.cc",
"pooling.cc",
"pow.cc",
@@ -1155,6 +1159,47 @@ tf_cc_test(
],
)
+tf_cc_test(
+ name = "pack_test",
+ size = "small",
+ srcs = ["pack_test.cc"],
+ tags = ["tflite_not_portable_ios"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:builtin_op_data",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "one_hot_test",
+ size = "small",
+ srcs = ["one_hot_test.cc"],
+ tags = ["tflite_not_portable_ios"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
+tf_cc_test(
+ name = "logical_test",
+ size = "small",
+ srcs = ["logical_test.cc"],
+ tags = ["tflite_not_portable_ios"],
+ deps = [
+ ":builtin_ops",
+ "//tensorflow/contrib/lite:builtin_op_data",
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite/kernels:test_util",
+ "@com_google_googletest//:gtest",
+ ],
+)
+
filegroup(
name = "all_files",
srcs = glob(
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 99f81c4a8a..6e13b8c667 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
@@ -186,8 +185,8 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
TfLiteTensor* output = GetOutput(context, node, 0);
TF_LITE_ENSURE_EQ(context, input->type, output->type);
- TF_LITE_ENSURE(context,
- NumDimensions(input) == 2 || NumDimensions(input) == 4);
+ const int num_dims = NumDimensions(input);
+ TF_LITE_ENSURE(context, num_dims == 1 || num_dims == 2 || num_dims == 4);
if (input->type == kTfLiteUInt8) {
TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
@@ -365,13 +364,9 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteOk;
}
-// Takes a 2D tensor and perform softmax along the second dimension.
-void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
- TfLiteSoftmaxParams* params) {
- const int batch_size = input->dims->data[0];
- const int input_size = input->dims->data[1];
- float* in = input->data.f;
- float* out = output->data.f;
+// Performs softmax along the input of size (input_size * batch_size).
+void Softmax(const float* in, const int input_size, const int batch_size,
+ const float beta, float* out) {
TF_LITE_ASSERT(input_size > 0);
// For each batch
@@ -385,7 +380,7 @@ void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
// Compute the normalized sum of exps.
float exp_sum = 0.0;
for (int i = 0; i < input_size; i++) {
- out[i] = std::exp((in[i] - max_coeff) * params->beta);
+ out[i] = std::exp((in[i] - max_coeff) * beta);
exp_sum += out[i];
}
@@ -401,6 +396,33 @@ void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
}
}
+// Takes a 1D tensor and performs softmax along it.
+void Softmax1DFloat(const TfLiteTensor* input, TfLiteTensor* output,
+ TfLiteSoftmaxParams* params) {
+ const int input_size = input->dims->data[0];
+ Softmax(input->data.f, input_size, 1, params->beta, output->data.f);
+}
+
+// Takes a 2D tensor and performs softmax along the last dimension.
+void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
+ TfLiteSoftmaxParams* params) {
+ const int batch_size = input->dims->data[0];
+ const int input_size = input->dims->data[1];
+ Softmax(input->data.f, input_size, batch_size, params->beta, output->data.f);
+}
+
+void Softmax1DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
+ TfLiteSoftmaxParams* params, OpData* data) {
+ // TODO(ahentz): this is arguably a dirty trick. Since the implementation
+ // always traverses the last dimension of a 4D tensor, we will pretend our 1D
+ // tensor is 4D in a special way. We will convert a (Y) shape into a (1,
+ // 1, 1, Y) shape.
+ const int input_size = input->dims->data[0];
+ optimized_ops::Softmax(
+ GetTensorData<uint8_t>(input), GetTensorShape({1, 1, 1, input_size}),
+ data->input_multiplier, data->input_left_shift, data->diff_min,
+ GetTensorData<uint8_t>(output), GetTensorShape({1, 1, 1, input_size}));
+}
+
void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
TfLiteSoftmaxParams* params, OpData* data) {
// TODO(ahentz): this is arguably a dirty trick. Since the implementation
@@ -443,6 +465,10 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
// dimensions.
switch (input->type) {
case kTfLiteFloat32: {
+ if (NumDimensions(input) == 1) {
+ Softmax1DFloat(input, output, params);
+ return kTfLiteOk;
+ }
if (NumDimensions(input) == 2) {
Softmax2DFloat(input, output, params);
return kTfLiteOk;
@@ -452,11 +478,15 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
return kTfLiteOk;
}
context->ReportError(
- context, "Only 2D and 4D tensors supported currently, got %dD.",
+ context, "Only 1D, 2D and 4D tensors supported currently, got %dD.",
NumDimensions(input));
return kTfLiteError;
}
case kTfLiteUInt8: {
+ if (NumDimensions(input) == 1) {
+ Softmax1DQuantized(input, output, params, data);
+ return kTfLiteOk;
+ }
if (NumDimensions(input) == 2) {
Softmax2DQuantized(input, output, params, data);
return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
index 587e1303da..083cdf78d7 100644
--- a/tensorflow/contrib/lite/kernels/activations_test.cc
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -339,6 +339,29 @@ TEST(QuantizedActivationsOpTest, Softmax4D) {
kQuantizedTolerance)));
}
+TEST(FloatActivationsOpTest, Softmax1D) {
+ FloatActivationsOpModel m(0.1,
+ /*input=*/{TensorType_FLOAT32, {8}});
+ m.SetInput({0, -6, 2, 4, 3, -2, 10, 1});
+ m.Invoke();
+ EXPECT_THAT(
+ m.GetOutput(),
+ ElementsAreArray(ArrayFloatNear(
+ {.09752, .05352, .11911, .14548, .13164, .07984, .26509, .10778})));
+}
+
+TEST(QuantizedActivationsOpTest, Softmax1D) {
+ QuantizedActivationsOpModel m(0.1,
+ /*input=*/{TensorType_UINT8, {8}, -10, 10});
+ m.SetInput<uint8_t>({0, -6, 2, 4, 3, -2, 10, 1});
+ m.Invoke();
+ EXPECT_THAT(
+ m.GetDequantizedOutput<uint8_t>(),
+ ElementsAreArray(ArrayFloatNear({0.09766, 0.05469, 0.12109, 0.14453,
+ 0.13281, 0.07813, 0.26563, 0.10938},
+ kQuantizedTolerance)));
+}
+
TEST(FloatActivationsOpTest, Softmax2D) {
FloatActivationsOpModel m(0.1,
/*input=*/{TensorType_FLOAT32, {2, 4}});
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
index 14a19aeef3..a11a59aa05 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
index aa24c1f34c..517309a226 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdlib>
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
index 45ea8d0049..ad211e9c67 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index a4fe9e5550..6f174763df 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <algorithm>
#include <cassert>
#include <cmath>
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
index 16e5f1d065..21518156b8 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/eigen_support.cc b/tensorflow/contrib/lite/kernels/eigen_support.cc
index 4f0d020793..e542ad0765 100644
--- a/tensorflow/contrib/lite/kernels/eigen_support.cc
+++ b/tensorflow/contrib/lite/kernels/eigen_support.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include <utility>
+#include "tensorflow/contrib/lite/arena_planner.h"
#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h"
#include "tensorflow/contrib/lite/kernels/op_macros.h"
@@ -23,6 +24,16 @@ namespace tflite {
namespace eigen_support {
namespace {
+#ifndef EIGEN_DONT_ALIGN
+// Eigen may require buffers to be aligned to 16, 32 or 64 bytes depending on
+// the hardware architecture and build configuration.
+// If the static assertion fails, try increasing `kDefaultTensorAlignment` in
+// `arena_planner.h` to 32 or 64.
+static_assert(
+    kDefaultTensorAlignment % EIGEN_MAX_ALIGN_BYTES == 0,
+    "kDefaultTensorAlignment doesn't comply with Eigen alignment requirement.");
+#endif // EIGEN_DONT_ALIGN
+
// We have a single global threadpool for all convolution operations. This means
// that inferences started from different threads may block each other, but
// since the underlying resource of CPU cores should be consumed by the
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup.cc b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
index f550339d03..b2dff87e62 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
@@ -29,7 +29,6 @@ limitations under the License.
// When indices are out of bound, the ops will not succeed.
//
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
index 3b203dd480..bc370608c0 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
@@ -71,7 +70,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
// Instead, we allocate a new object to carry information from Prepare() to
// Eval().
gemm_support::IncrementUsageCounter(context);
- auto* op_data = new OpData;
+ auto* op_data = new OpData();
context->AddTensors(context, 1, &op_data->input_quantized_index);
return op_data;
}
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
index 41211d41aa..f37c66acb3 100644
--- a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
@@ -31,7 +31,6 @@ limitations under the License.
// Each item indicates whether the corresponding lookup has a returned value.
// 0 for missing key, 1 for found key.
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
index 45c9f65b64..63c89d1eee 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -115,10 +115,10 @@ void ClipVector(const float* vector, int v_size, float abs_limit,
}
void SymmetricQuantizeFloats(const float* values, const int size,
- int8_t* quantized_values, float* min, float* max,
- float* scaling_factor) {
- NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min,
- max, scaling_factor);
+ int8_t* quantized_values, float* min_value,
+ float* max_value, float* scaling_factor) {
+ NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
+ min_value, max_value, scaling_factor);
}
void VectorShiftLeft(float* vector, int v_size, float shift_value) {
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
index db7926df9a..010b40b901 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -19,6 +19,10 @@ limitations under the License.
// structure.
#include "tensorflow/contrib/lite/builtin_op_data.h"
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
#ifndef USE_NEON
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define USE_NEON
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
index 7ead449ca8..6bd88b5596 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -14,6 +14,7 @@ limitations under the License.
==============================================================================*/
#include <stdlib.h>
#include <string.h>
+#include <algorithm>
#include "tensorflow/contrib/lite/builtin_op_data.h"
#include "tensorflow/contrib/lite/kernels/activation_functor.h"
@@ -38,14 +39,14 @@ bool PortableIsZeroVector(const float* vector, int v_size) {
void PortableSymmetricQuantizeFloats(const float* values, const int size,
int8_t* quantized_values,
- float* __restrict__ min,
- float* __restrict__ max,
+ float* __restrict__ min_value,
+ float* __restrict__ max_value,
float* __restrict__ scaling_factor) {
auto minmax = std::minmax_element(values, values + size);
- *min = *minmax.first;
- *max = *minmax.second;
+ *min_value = *minmax.first;
+ *max_value = *minmax.second;
const int kScale = 127;
- const float range = std::max(std::abs(*min), std::abs(*max));
+ const float range = std::max(std::abs(*min_value), std::abs(*max_value));
if (range == 0) {
memset(quantized_values, 0, size * sizeof(int8_t));
*scaling_factor = 1;
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
index d3a4fa8507..a375aaffa6 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -19,6 +19,10 @@ limitations under the License.
// structure.
#include "tensorflow/contrib/lite/builtin_op_data.h"
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
namespace tflite {
namespace tensor_utils {
@@ -28,8 +32,8 @@ float PortableClip(float f, float abs_limit);
bool PortableIsZeroVector(const float* vector, int v_size);
void PortableSymmetricQuantizeFloats(const float* values, const int size,
- int8_t* quantized_values, float* min,
- float* max, float* scaling_factor);
+ int8_t* quantized_values, float* min_value,
+ float* max_value, float* scaling_factor);
// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 10e23f0b41..714613b96e 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -1860,6 +1860,26 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
}
}
+template <typename Scalar>
+void Pack(int dim, const Scalar* const* input_data,
+ const Dims<4>* const* input_dims, int inputs_count,
+ Scalar* output_data, const Dims<4>& output_dims) {
+ TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
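+  // All dimensions outside the pack axis collapse into 'outer_size'. For each
+  // outer index, every input contributes one contiguous block of 'copy_size'
+  // elements; interleaving those blocks stacks the inputs along a new axis.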
+ int outer_size = 1;
+ for (int i = dim + 1; i < 4; i++) {
+ outer_size *= output_dims.sizes[i];
+ }
+ Scalar* output_ptr = output_data;
+ const int copy_size = FlatSize(**input_dims) / outer_size;
+ for (int k = 0; k < outer_size; k++) {
+ for (int i = 0; i < inputs_count; ++i) {
+ memcpy(output_ptr, input_data[i] + k * copy_size,
+ copy_size * sizeof(Scalar));
+ output_ptr += copy_size;
+ }
+ }
+}
+
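For intuition, here is a self-contained sketch of the Pack copy loop above, specialized to three length-2 inputs packed along the outermost axis (the same case the FloatThreeInputs unit test added later in this patch exercises):

#include <cstdio>
#include <cstring>

int main() {
  // Three length-2 inputs, packed along the outermost axis:
  // outer_size = 1, copy_size = 2, inputs_count = 3.
  const float a[] = {1, 4}, b[] = {2, 5}, c[] = {3, 6};
  const float* input_data[] = {a, b, c};
  const int inputs_count = 3, outer_size = 1, copy_size = 2;
  float output[6];
  float* output_ptr = output;
  for (int k = 0; k < outer_size; ++k) {
    for (int i = 0; i < inputs_count; ++i) {
      std::memcpy(output_ptr, input_data[i] + k * copy_size,
                  copy_size * sizeof(float));
      output_ptr += copy_size;
    }
  }
  for (float v : output) std::printf("%g ", v);  // 1 4 2 5 3 6
  return 0;
}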
// TODO(prabhumk): This is the same as the optimized implementation.
// TODO(prabhumk): The quantized implementation of concatenation isn't fully
// quantized as it takes scale as a floating point value. This should be fixed
@@ -3264,7 +3284,8 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
const Dims<4>& block_shape_dims,
const int32* paddings_data,
const Dims<4>& paddings_dims, T* output_data,
- const Dims<4>& output_dims) {
+ const Dims<4>& output_dims,
+ const int32_t pad_value) {
const int output_batch_size = ArraySize(output_dims, 3);
const int output_height = ArraySize(output_dims, 2);
const int output_width = ArraySize(output_dims, 1);
@@ -3289,7 +3310,7 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
padding_top + input_height ||
out_w * block_shape_width + shift_w < padding_left ||
out_w * block_shape_width + shift_w >= padding_left + input_width) {
- memset(out, 0, depth * sizeof(T));
+ memset(out, pad_value, depth * sizeof(T));
} else {
const T* in =
input_data +
@@ -3305,6 +3326,17 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
}
template <typename T>
+inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
+ const int32* block_shape_data,
+ const Dims<4>& block_shape_dims,
+ const int32* paddings_data,
+ const Dims<4>& paddings_dims, T* output_data,
+ const Dims<4>& output_dims) {
+ SpaceToBatchND(input_data, input_dims, block_shape_data, block_shape_dims,
+ paddings_data, paddings_dims, output_data, output_dims, 0);
+}
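The new pad_value parameter matters for quantized tensors: a quantized uint8 zero is the zero_point, not the raw byte 0 (the space_to_batch_nd.cc kernel later in this patch passes the output's zero_point for uint8). A small sketch of the arithmetic, assuming a scale of 0.5 and a zero_point of 128:

#include <cstdio>

int main() {
  // real_value = scale * (quantized - zero_point); padding with the raw byte
  // 0 would therefore inject a large negative real value, while padding with
  // the zero_point injects a real 0.
  const float scale = 0.5f;
  const int zero_point = 128;
  for (int q : {0, zero_point}) {
    std::printf("q=%3d -> real=%g\n", q, scale * (q - zero_point));
  }
  return 0;
}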
+
+template <typename T>
inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
const int32* block_shape_data,
const Dims<4>& block_shape_dims,
@@ -3517,7 +3549,6 @@ inline bool Reduce(const In* input_data, const int* input_dims,
Out reducer(const Out current, const In in),
Out* output_data) {
// Reset input iterator.
- TFLITE_DCHECK(input_num_dims > 0);
for (int idx = 0; idx < input_num_dims; ++idx) {
input_iter[idx] = 0;
}
@@ -3537,6 +3568,10 @@ inline bool ResolveAxis(const int num_dims, const int* axis,
const int64_t num_axis, int* out_axis,
int* out_num_axis) {
*out_num_axis = 0; // Just in case.
+ // Short-circuit axis resolution for scalars; the axis will go unused.
+ if (num_dims == 0) {
+ return true;
+ }
// O(n^2) is fine since out_num_axis should be really small, mostly <= 4
for (int64_t idx = 0; idx < num_axis; ++idx) {
// Handle negative index.
@@ -4220,6 +4255,38 @@ inline void BroadcastPow(const T* input1_data, const Dims<4>& input1_dims,
}
}
+inline void Logical(const bool* input1_data, const Dims<4>& input1_dims,
+ const bool* input2_data, const Dims<4>& input2_dims,
+ bool* output_data, const Dims<4>& output_dims,
+ const std::function<bool(bool, bool)>& func) {
+ const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+ for (int i = 0; i < flat_size; ++i) {
+ output_data[i] = func(input1_data[i], input2_data[i]);
+ }
+}
+
+inline void BroadcastLogical(const bool* input1_data,
+ const Dims<4>& input1_dims,
+ const bool* input2_data,
+ const Dims<4>& input2_dims, bool* output_data,
+ const Dims<4>& output_dims,
+ const std::function<bool(bool, bool)>& func) {
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ func(input1_data[SubscriptToIndex(desc1, c, x, y, b)],
+ input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+ }
+ }
+ }
+ }
+}
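A minimal sketch of what BroadcastLogical computes in the degenerate case where the second operand is a single broadcast scalar (the case exercised by the BroadcastLogicalOr test added later in this patch):

#include <cstdio>
#include <functional>

int main() {
  const bool input1[] = {true, false, false, true};  // shape {1, 1, 1, 4}
  const bool input2 = false;                         // shape {1, 1, 1, 1}
  bool output[4];
  const std::function<bool(bool, bool)> func = std::logical_or<bool>();
  for (int i = 0; i < 4; ++i) {
    output[i] = func(input1[i], input2);  // input2's broadcast index is always 0
  }
  for (bool v : output) std::printf("%d ", v);  // 1 0 0 1
  return 0;
}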
+
} // namespace reference_ops
} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
index 4eddf7bf0a..20abcb7258 100644
--- a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
+++ b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
@@ -43,13 +43,13 @@ bool Spectrogram::Initialize(int window_length, int step_length) {
return Initialize(window, step_length);
}
-inline int Log2Floor(uint n) {
+inline int Log2Floor(uint32_t n) {
if (n == 0) return -1;
int log = 0;
- uint value = n;
+ uint32_t value = n;
for (int i = 4; i >= 0; --i) {
int shift = (1 << i);
- uint x = value >> shift;
+ uint32_t x = value >> shift;
if (x != 0) {
value = x;
log += shift;
@@ -58,7 +58,7 @@ inline int Log2Floor(uint n) {
return log;
}
-inline int Log2Ceiling(uint n) {
+inline int Log2Ceiling(uint32_t n) {
int floor = Log2Floor(n);
if (n == (n & ~(n - 1))) // zero or a power of two
return floor;
@@ -66,7 +66,7 @@ inline int Log2Ceiling(uint n) {
return floor + 1;
}
-inline uint NextPowerOfTwo(uint value) {
+inline uint32_t NextPowerOfTwo(uint32_t value) {
int exponent = Log2Ceiling(value);
// DCHECK_LT(exponent, std::numeric_limits<uint32>::digits);
return 1 << exponent;
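The Log2Floor change above is purely a uint -> uint32_t type cleanup; the shift-based binary search itself is unchanged. A standalone copy for reference:

#include <cstdint>
#include <cstdio>

inline int Log2Floor(uint32_t n) {
  if (n == 0) return -1;
  int log = 0;
  uint32_t value = n;
  for (int i = 4; i >= 0; --i) {
    // Try shifts of 16, 8, 4, 2, 1; keep each one that leaves a nonzero value.
    int shift = (1 << i);
    uint32_t x = value >> shift;
    if (x != 0) {
      value = x;
      log += shift;
    }
  }
  return log;
}

int main() {
  std::printf("%d\n", Log2Floor(37));  // prints 5, since 2^5 <= 37 < 2^6
  return 0;
}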
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
index 82f4503127..1ff8cfe39c 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
@@ -17,6 +17,10 @@ limitations under the License.
#include "tensorflow/contrib/lite/builtin_op_data.h"
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
namespace tflite {
namespace tensor_utils {
@@ -31,8 +35,8 @@ bool IsZeroVector(const float* vector, int v_size);
// It also outputs the range (min, max) of the floating point buffer, and the
// scaling factor used to quantize the values.
void SymmetricQuantizeFloats(const float* values, const int size,
- int8_t* quantized_values, float* min, float* max,
- float* scaling_factor);
+ int8_t* quantized_values, float* min_value,
+ float* max_value, float* scaling_factor);
// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
// dimension composed by input vectors independent from each other). The result
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index fe113dfdd3..c44698b677 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -278,7 +278,9 @@ inline tflite::Dims<4> ToRuntimeDims(const tflite::RuntimeShape& array_shape) {
// Gets next index to iterate through a multidimensional array.
inline bool NextIndex(const int num_dims, const int* dims, int* current) {
- TFLITE_DCHECK_GT(num_dims, 0);
+ if (num_dims == 0) {
+ return false;
+ }
TFLITE_DCHECK(dims != nullptr);
TFLITE_DCHECK(current != nullptr);
int carry = 1;
@@ -305,7 +307,9 @@ inline bool NextIndex(const int num_dims, const int* dims, int* current) {
inline size_t ReducedOutputOffset(const int num_dims, const int* dims,
const int* index, const int num_axis,
const int* axis) {
- TFLITE_DCHECK_GT(num_dims, 0);
+ if (num_dims == 0) {
+ return 0;
+ }
TFLITE_DCHECK(dims != nullptr);
TFLITE_DCHECK(index != nullptr);
size_t offset = 0;
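The guards above make scalar tensors (num_dims == 0) a no-op for iteration instead of a DCHECK failure. A minimal odometer-style reimplementation of the NextIndex pattern for illustration (the library body past `int carry = 1;` is outside this hunk, so the details below are an assumption):

#include <cstdio>

// Odometer-style index advance; returns false once all indices have wrapped.
bool Next(int num_dims, const int* dims, int* current) {
  if (num_dims == 0) return false;  // scalar: nothing to iterate
  for (int idx = num_dims - 1; idx >= 0; --idx) {
    if (++current[idx] < dims[idx]) return true;
    current[idx] = 0;  // wrapped: carry into the next-higher dimension
  }
  return false;
}

int main() {
  int dims[] = {2, 2};
  int current[] = {0, 0};
  do {
    std::printf("(%d,%d) ", current[0], current[1]);
  } while (Next(2, dims, current));
  // prints: (0,0) (0,1) (1,0) (1,1)
  return 0;
}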
diff --git a/tensorflow/contrib/lite/kernels/logical.cc b/tensorflow/contrib/lite/kernels/logical.cc
new file mode 100644
index 0000000000..3dc39bf79a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/logical.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace logical {
+namespace {
+
+// Input/output tensor indices.
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+// Op data for logical op.
+struct OpData {
+ bool requires_broadcast;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ auto* data = new OpData;
+ data->requires_broadcast = false;
+ return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ // Reinterpret the opaque data provided by the user.
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+ const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+
+ const TfLiteType type = input1->type;
+ if (type != kTfLiteBool) {
+ context->ReportError(context, "Logical ops only support bool type.");
+ return kTfLiteError;
+ }
+ output->type = type;
+
+ data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+ TfLiteIntArray* output_size = nullptr;
+ if (data->requires_broadcast) {
+ TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+ context, input1, input2, &output_size));
+ } else {
+ output_size = TfLiteIntArrayCopy(input1->dims);
+ }
+
+ return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus LogicalImpl(TfLiteContext* context, TfLiteNode* node,
+ const std::function<bool(bool, bool)>& func) {
+ OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+ const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+ const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+ if (data->requires_broadcast) {
+ reference_ops::BroadcastLogical(
+ GetTensorData<bool>(input1), GetTensorDims(input1),
+ GetTensorData<bool>(input2), GetTensorDims(input2),
+ GetTensorData<bool>(output), GetTensorDims(output), func);
+ } else {
+ reference_ops::Logical(GetTensorData<bool>(input1), GetTensorDims(input1),
+ GetTensorData<bool>(input2), GetTensorDims(input2),
+ GetTensorData<bool>(output), GetTensorDims(output),
+ func);
+ }
+
+ return kTfLiteOk;
+}
+
+TfLiteStatus LogicalOrEval(TfLiteContext* context, TfLiteNode* node) {
+ const auto logical_or_func = std::logical_or<bool>();
+ return LogicalImpl(context, node, logical_or_func);
+}
+
+} // namespace
+} // namespace logical
+
+TfLiteRegistration* Register_LOGICAL_OR() {
+ // Init, Free, Prepare, and Eval satisfy the interface required by
+ // TfLiteRegistration.
+ static TfLiteRegistration r = {logical::Init, logical::Free, logical::Prepare,
+ logical::LogicalOrEval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/logical_test.cc b/tensorflow/contrib/lite/kernels/logical_test.cc
new file mode 100644
index 0000000000..382008245b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/logical_test.cc
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+class LogicalOpModel : public SingleOpModel {
+ public:
+ LogicalOpModel(std::initializer_list<int> input1_shape,
+ std::initializer_list<int> input2_shape, BuiltinOperator op) {
+ input1_ = AddInput(TensorType_BOOL);
+ input2_ = AddInput(TensorType_BOOL);
+ output_ = AddOutput(TensorType_BOOL);
+ ConfigureBuiltinOp(op);
+ BuildInterpreter({input1_shape, input2_shape});
+ }
+
+ int input1() { return input1_; }
+ int input2() { return input2_; }
+
+ std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+ int input1_;
+ int input2_;
+ int output_;
+
+ void ConfigureBuiltinOp(BuiltinOperator op) {
+ switch (op) {
+ case BuiltinOperator_LOGICAL_OR: {
+ SetBuiltinOp(op, BuiltinOptions_LogicalOrOptions,
+ CreateLogicalOrOptions(builder_).Union());
+ break;
+ }
+ default: { FAIL() << "We shouldn't get here."; }
+ }
+ }
+};
+
+TEST(LogicalTest, LogicalOr) {
+ LogicalOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, BuiltinOperator_LOGICAL_OR);
+ model.PopulateTensor<bool>(model.input1(), {true, false, false, true});
+ model.PopulateTensor<bool>(model.input2(), {true, false, true, false});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, true));
+ EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
+TEST(LogicalTest, BroadcastLogicalOr) {
+ LogicalOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, BuiltinOperator_LOGICAL_OR);
+ model.PopulateTensor<bool>(model.input1(), {true, false, false, true});
+ model.PopulateTensor<bool>(model.input2(), {false});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true));
+ EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ ::tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection.cc b/tensorflow/contrib/lite/kernels/lsh_projection.cc
index 25d2dc2cdd..69523b02cc 100644
--- a/tensorflow/contrib/lite/kernels/lsh_projection.cc
+++ b/tensorflow/contrib/lite/kernels/lsh_projection.cc
@@ -50,7 +50,6 @@ limitations under the License.
// Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] }
// A flattened tensor represents projected bit vectors.
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
index 4dfc891548..ba251c451e 100644
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
@@ -97,7 +96,7 @@ constexpr int kCellStateTensor = 1;
constexpr int kOutputTensor = 2;
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
- auto* op_data = new OpData;
+ auto* op_data = new OpData();
op_data->kernel_type = kTfLiteLSTMFullKernel;
context->AddTensors(context, /*tensors_to_add=*/7,
&op_data->scratch_tensor_index);
@@ -847,7 +846,7 @@ enum OutputTensor {
};
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
- auto* op_data = new OpData;
+ auto* op_data = new OpData();
op_data->kernel_type = kTfLiteLSTMBasicKernel;
// `scratch_tensor_index` is unused in this kernel.
op_data->scratch_tensor_index = -1;
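The `new OpData` -> `new OpData()` change swaps default-initialization for value-initialization, so members start zeroed rather than indeterminate. A sketch with a simplified, hypothetical OpData:

#include <cstdio>

// Simplified, hypothetical stand-in for the kernel's OpData.
struct OpData {
  int kernel_type;
  int scratch_tensor_index;
};

int main() {
  OpData* op_data = new OpData();  // value-initialized: members start at 0
  std::printf("%d %d\n", op_data->kernel_type, op_data->scratch_tensor_index);
  delete op_data;
  // With a bare `new OpData`, both members would be indeterminate for this
  // POD type, which is exactly what the added () guards against.
  return 0;
}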
diff --git a/tensorflow/contrib/lite/kernels/one_hot.cc b/tensorflow/contrib/lite/kernels/one_hot.cc
new file mode 100644
index 0000000000..9ff3dca932
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/one_hot.cc
@@ -0,0 +1,199 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace one_hot {
+
+constexpr int kIndicesTensor = 0;
+constexpr int kDepthTensor = 1;
+constexpr int kOnValueTensor = 2;
+constexpr int kOffValueTensor = 3;
+constexpr int kOutputTensor = 0;
+
+// Convenience utility for destructuring a node into the appropriate tensors and
+// data for the op. Note that this destructuring is quite cheap, so we can avoid
+// allocating op-specific, persistent data on the heap.
+struct OneHotContext {
+ OneHotContext(TfLiteContext* context, TfLiteNode* node) {
+ indices = GetInput(context, node, kIndicesTensor);
+ depth = GetInput(context, node, kDepthTensor);
+ on_value = GetInput(context, node, kOnValueTensor);
+ off_value = GetInput(context, node, kOffValueTensor);
+ output = GetOutput(context, node, kOutputTensor);
+
+ const auto* params =
+ reinterpret_cast<TfLiteOneHotParams*>(node->builtin_data);
+ const int indices_dims = indices->dims->size;
+ axis = (params->axis == -1) ? indices_dims : params->axis;
+ output_dims = indices_dims + 1;
+ dtype = on_value->type;
+ }
+
+ const TfLiteTensor* indices;
+ const TfLiteTensor* depth;
+ const TfLiteTensor* on_value;
+ const TfLiteTensor* off_value;
+ TfLiteTensor* output;
+ int axis;
+ int output_dims;
+ TfLiteType dtype;
+};
+
+template <typename T, typename TI>
+void OneHotComputeImpl(const OneHotContext& op_context) {
+ // prefix_dim_size == # of elements before the axis
+ // depth == # of elements per axis
+ // suffix_dim_size == # of elements after the axis
+ int prefix_dim_size = 1;
+ for (int i = 0; i < op_context.axis; ++i) {
+ prefix_dim_size *= op_context.indices->dims->data[i];
+ }
+ const int suffix_dim_size = NumElements(op_context.indices) / prefix_dim_size;
+ const int depth = *op_context.depth->data.i32;
+
+ const T on_value = *GetTensorData<T>(op_context.on_value);
+ const T off_value = *GetTensorData<T>(op_context.off_value);
+
+ // View the indices as a matrix of size:
+ // prefix_dim_size x suffix_dim_size
+ // View the output as a matrix of size:
+ // prefix_dim_size x depth x suffix_dim_size
+ // Then the output is:
+ // output(i, j, k) == (indices(i, k) == j) ? on : off
+ T* output = GetTensorData<T>(op_context.output);
+ const TI* indices = GetTensorData<TI>(op_context.indices);
+ for (int i = 0; i < prefix_dim_size; ++i) {
+ for (int j = 0; j < depth; ++j) {
+ for (int k = 0; k < suffix_dim_size; ++k, ++output) {
+ *output = static_cast<int>(indices[i * suffix_dim_size + k]) == j
+ ? on_value
+ : off_value;
+ }
+ }
+ }
+}
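A standalone sketch of the triple loop above for indices {0, 2, -1, 1}, depth 3, axis -1 (so prefix_dim_size = 4 and suffix_dim_size = 1), matching the OnOffValues test added later in this patch:

#include <cstdio>

int main() {
  const int indices[] = {0, 2, -1, 1};
  const int prefix_dim_size = 4, depth = 3, suffix_dim_size = 1;
  const int on_value = 5, off_value = 0;
  int output_buf[12];
  int* output = output_buf;
  for (int i = 0; i < prefix_dim_size; ++i) {
    for (int j = 0; j < depth; ++j) {
      for (int k = 0; k < suffix_dim_size; ++k, ++output) {
        // Out-of-range indices (like -1) match no j and stay at off_value.
        *output = indices[i * suffix_dim_size + k] == j ? on_value : off_value;
      }
    }
  }
  for (int v : output_buf) std::printf("%d ", v);
  // prints: 5 0 0 0 0 5 0 0 0 0 5 0
  return 0;
}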
+
+template <typename T>
+void OneHotCompute(const OneHotContext& op_context) {
+ if (op_context.indices->type == kTfLiteInt64) {
+ OneHotComputeImpl<T, int64_t>(op_context);
+ } else {
+ OneHotComputeImpl<T, int>(op_context);
+ }
+}
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+ const OneHotContext& op_context) {
+ TF_LITE_ENSURE(context, *op_context.depth->data.i32 >= 0);
+ TfLiteIntArray* output_size = TfLiteIntArrayCreate(op_context.output_dims);
+ for (int i = 0; i < op_context.output_dims; ++i) {
+ if (i < op_context.axis) {
+ output_size->data[i] = op_context.indices->dims->data[i];
+ } else if (i == op_context.axis) {
+ output_size->data[i] = *op_context.depth->data.i32;
+ } else {
+ output_size->data[i] = op_context.indices->dims->data[i - 1];
+ }
+ }
+ return context->ResizeTensor(context, op_context.output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ OneHotContext op_context{context, node};
+ switch (op_context.dtype) {
+ // TODO(b/111744875): Support uint8 and quantization.
+ case kTfLiteFloat32:
+ case kTfLiteInt16:
+ case kTfLiteInt32:
+ case kTfLiteInt64:
+ case kTfLiteBool:
+ op_context.output->type = op_context.dtype;
+ break;
+ default:
+ context->ReportError(context, "Unknown output data type: %d",
+ op_context.dtype);
+ return kTfLiteError;
+ }
+
+ TF_LITE_ENSURE(context, op_context.indices->type == kTfLiteInt32 ||
+ op_context.indices->type == kTfLiteInt64);
+ TF_LITE_ENSURE(context, op_context.axis >= 0 &&
+ op_context.axis < op_context.output_dims);
+ TF_LITE_ENSURE_EQ(context, NumElements(op_context.depth), 1);
+ TF_LITE_ENSURE_EQ(context, NumElements(op_context.on_value), 1);
+ TF_LITE_ENSURE_EQ(context, NumElements(op_context.off_value), 1);
+ TF_LITE_ENSURE_EQ(context, op_context.on_value->type, op_context.dtype);
+ TF_LITE_ENSURE_EQ(context, op_context.off_value->type, op_context.dtype);
+
+ if (!IsConstantTensor(op_context.depth)) {
+ SetTensorToDynamic(op_context.output);
+ return kTfLiteOk;
+ }
+
+ return ResizeOutputTensor(context, op_context);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ OneHotContext op_context{context, node};
+
+ if (IsDynamicTensor(op_context.output)) {
+ ResizeOutputTensor(context, op_context);
+ }
+
+ switch (op_context.output->type) {
+ case kTfLiteFloat32:
+ OneHotCompute<float>(op_context);
+ break;
+ case kTfLiteInt32:
+ OneHotCompute<int>(op_context);
+ break;
+ case kTfLiteInt64:
+ OneHotCompute<int64_t>(op_context);
+ break;
+ case kTfLiteBool:
+ OneHotCompute<bool>(op_context);
+ break;
+ default:
+ return kTfLiteError;
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace one_hot
+
+TfLiteRegistration* Register_ONE_HOT() {
+ static TfLiteRegistration r = {
+ nullptr,
+ nullptr,
+ one_hot::Prepare,
+ one_hot::Eval,
+ };
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/one_hot_test.cc b/tensorflow/contrib/lite/kernels/one_hot_test.cc
new file mode 100644
index 0000000000..6b604ec7a7
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/one_hot_test.cc
@@ -0,0 +1,182 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <initializer_list>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class OneHotOpModel : public SingleOpModel {
+ public:
+ OneHotOpModel(std::initializer_list<int> input_shape, int depth_value,
+ TensorType dtype, int axis = -1, T on_value = 1,
+ T off_value = 0, TensorType indices_type = TensorType_INT32) {
+ indices_ = AddInput(indices_type);
+ int depth = AddInput(TensorType_INT32);
+ int on = AddInput(dtype);
+ int off = AddInput(dtype);
+ output_ = AddOutput(dtype);
+ SetBuiltinOp(BuiltinOperator_ONE_HOT, BuiltinOptions_OneHotOptions,
+ CreateOneHotOptions(builder_, axis).Union());
+ BuildInterpreter({input_shape});
+
+ PopulateTensor<int>(depth, {depth_value});
+ PopulateTensor<T>(on, {on_value});
+ PopulateTensor<T>(off, {off_value});
+ }
+
+ template <typename TI>
+ void SetIndices(std::initializer_list<TI> data) {
+ PopulateTensor<TI>(indices_, data);
+ }
+
+ TfLiteStatus InvokeWithResult() { return interpreter_->Invoke(); }
+
+ int32_t GetOutputSize() { return GetTensorSize(output_); }
+ std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+ int indices_;
+ int output_;
+};
+
+TEST(OneHotOpTest, BasicFloat) {
+ const int depth = 3;
+ OneHotOpModel<float> model({3}, depth, TensorType_FLOAT32);
+ model.SetIndices({0, 1, 2});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3}));
+ EXPECT_THAT(model.GetOutput(),
+ ElementsAreArray({1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f}));
+}
+
+TEST(OneHotOpTest, BasicInt) {
+ const int depth = 3;
+ OneHotOpModel<int> model({3}, depth, TensorType_INT32);
+ model.SetIndices({0, 1, 2});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3}));
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0, 0, 1, 0, 0, 0, 1}));
+}
+
+TEST(OneHotOpTest, BasicBool) {
+ const int depth = 3;
+ OneHotOpModel<bool> model({3}, depth, TensorType_BOOL);
+ model.SetIndices({0, 1, 2});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3}));
+ EXPECT_THAT(model.GetOutput(),
+ ElementsAreArray({true, false, false, false, true, false, false,
+ false, true}));
+}
+
+TEST(OneHotOpTest, SmallDepth) {
+ const int depth = 1;
+ OneHotOpModel<int> model({3}, depth, TensorType_INT32);
+ model.SetIndices({0, 1, 2});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 1}));
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0}));
+}
+
+TEST(OneHotOpTest, BigDepth) {
+ const int depth = 4;
+ OneHotOpModel<int> model({2}, depth, TensorType_INT32);
+ model.SetIndices({0, 1});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 4}));
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0, 0, 0, 1, 0, 0}));
+}
+
+TEST(OneHotOpTest, OnOffValues) {
+ const int depth = 3;
+ const int axis = -1;
+ const int on = 5;
+ const int off = 0;
+ OneHotOpModel<int> model({4}, depth, TensorType_INT32, axis, on, off);
+ model.SetIndices({0, 2, -1, 1});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({4, 3}));
+ EXPECT_THAT(model.GetOutput(),
+ ElementsAreArray({5, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0}));
+}
+
+TEST(OneHotOpTest, ZeroAxis) {
+ const int depth = 3;
+ const int axis = 0;
+ const int on = 5;
+ const int off = 0;
+ OneHotOpModel<int> model({4}, depth, TensorType_INT32, axis, on, off);
+ model.SetIndices({0, 2, -1, 1});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 4}));
+ EXPECT_THAT(model.GetOutput(),
+ ElementsAreArray({5, 0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0}));
+}
+
+TEST(OneHotOpTest, MultiDimensionalIndices) {
+ const int depth = 3;
+ const int axis = -1;
+ const float on = 2;
+ const float off = 0;
+ OneHotOpModel<float> model({2, 2}, depth, TensorType_FLOAT32, axis, on, off);
+ model.SetIndices({0, 2, 1, -1});
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 2, 3}));
+ EXPECT_THAT(model.GetOutput(),
+ ElementsAreArray({2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0}));
+}
+
+TEST(OneHotOpTest, Int64Indices) {
+ const int depth = 3;
+ const int axis = -1;
+ const int on = 1;
+ const int off = 0;
+ OneHotOpModel<int> model({3}, depth, TensorType_INT32, axis, on, off,
+ TensorType_INT64);
+ std::initializer_list<int64_t> indices = {0, 1, 2};
+ model.SetIndices(indices);
+ model.Invoke();
+
+ EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3}));
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0, 0, 1, 0, 0, 0, 1}));
+}
+
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ ::tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/pack.cc b/tensorflow/contrib/lite/kernels/pack.cc
new file mode 100644
index 0000000000..bb3416f6a6
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pack.cc
@@ -0,0 +1,131 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace pack {
+namespace {
+
+constexpr int kOutputTensor = 0;
+
+// Op data for pack op.
+struct OpData {
+ int values_count;
+ int axis;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+ auto* data = new OpData;
+ data->axis = 0;
+ return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+ delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+
+ TF_LITE_ENSURE_EQ(context, NumInputs(node), data->values_count);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ const TfLiteTensor* input0 = GetInput(context, node, 0);
+ TF_LITE_ENSURE(context, NumDimensions(input0) < 4);
+ TF_LITE_ENSURE(context, NumDimensions(input0) >= data->axis);
+ // TODO(renjieliu): Support negative axis.
+ TF_LITE_ENSURE(context, data->axis >= 0);
+ if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32) {
+ context->ReportError(context,
+ "Currently pack only supports int32 and float32.");
+ return kTfLiteError;
+ }
+ // Make sure all inputs have the same shape and type.
+ for (int i = 1; i < data->values_count; ++i) {
+ const TfLiteTensor* input = GetInput(context, node, i);
+ TF_LITE_ENSURE(context, HaveSameShapes(input0, input));
+ TF_LITE_ENSURE_EQ(context, input0->type, input->type);
+ }
+
+ // Resize the output: rank R becomes rank R + 1.
+ const int dimension_size = NumDimensions(input0) + 1;
+ const TfLiteIntArray* input_shape = input0->dims;
+ TfLiteIntArray* output_shape = TfLiteIntArrayCreate(dimension_size);
+ int i = 0;
+ for (int index = 0; index < dimension_size; ++index) {
+ if (index == data->axis) {
+ output_shape->data[index] = data->values_count;
+ } else {
+ output_shape->data[index] = input_shape->data[i++];
+ }
+ }
+
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+ TF_LITE_ENSURE_EQ(context, output->type, input0->type);
+
+ return context->ResizeTensor(context, output, output_shape);
+}
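The shape computation in Prepare() splices values_count into the input shape at `axis`. A minimal sketch for a {2, 3} input packed along axis 1 (cf. the FloatMultiDimensions test below):

#include <cstdio>
#include <vector>

int main() {
  const std::vector<int> input_shape = {2, 3};  // rank R = 2
  const int axis = 1, values_count = 2;
  std::vector<int> output_shape;                // rank R + 1 = 3
  int i = 0;
  for (int index = 0; index < static_cast<int>(input_shape.size()) + 1;
       ++index) {
    if (index == axis) {
      output_shape.push_back(values_count);  // the new packed dimension
    } else {
      output_shape.push_back(input_shape[i++]);
    }
  }
  for (int d : output_shape) std::printf("%d ", d);  // 2 2 3
  return 0;
}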
+
+template <typename T>
+void PackImpl(TfLiteContext* context, TfLiteNode* node, TfLiteTensor* output,
+ int values_count, int axis) {
+ VectorOfTensors<T> all_inputs(*context, *node->inputs);
+ reference_ops::Pack<T>(RemapDim(NumDimensions(output), axis),
+ all_inputs.data(), all_inputs.dims(), values_count,
+ GetTensorData<T>(output), GetTensorDims(output));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+ const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+ switch (output->type) {
+ case kTfLiteFloat32: {
+ PackImpl<float>(context, node, output, data->values_count, data->axis);
+ break;
+ }
+ case kTfLiteInt32: {
+ PackImpl<int32_t>(context, node, output, data->values_count, data->axis);
+ break;
+ }
+ default: {
+ context->ReportError(context,
+ "Currently pack only supports int32 and float32.");
+ return kTfLiteError;
+ }
+ }
+
+ return kTfLiteOk;
+}
+
+} // namespace
+} // namespace pack
+
+TfLiteRegistration* Register_PACK() {
+ static TfLiteRegistration r = {pack::Init, pack::Free, pack::Prepare,
+ pack::Eval};
+ return &r;
+}
+
+} // namespace builtin
+} // namespace ops
+} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/pack_test.cc b/tensorflow/contrib/lite/kernels/pack_test.cc
new file mode 100644
index 0000000000..485a50ad3a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pack_test.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class PackOpModel : public SingleOpModel {
+ public:
+ PackOpModel(const TensorData& input_template, int axis, int values_count) {
+ std::vector<std::vector<int>> all_input_shapes;
+ for (int i = 0; i < values_count; ++i) {
+ all_input_shapes.push_back(input_template.shape);
+ AddInput(input_template);
+ }
+ output_ = AddOutput({input_template.type, /*shape=*/{}, input_template.min,
+ input_template.max});
+ SetBuiltinOp(BuiltinOperator_PACK, BuiltinOptions_PackOptions,
+ CreatePackOptions(builder_, values_count, axis).Union());
+ BuildInterpreter(all_input_shapes);
+ }
+
+ void SetInput(int index, std::initializer_list<T> data) {
+ PopulateTensor(index, data);
+ }
+
+ std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+ std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+ int output_;
+};
+
+TEST(PackOpTest, FloatThreeInputs) {
+ PackOpModel<float> model({TensorType_FLOAT32, {2}}, 0, 3);
+ model.SetInput(0, {1, 4});
+ model.SetInput(1, {2, 5});
+ model.SetInput(2, {3, 6});
+ model.Invoke();
+ EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
+}
+
+TEST(PackOpTest, FloatThreeInputsDifferentAxis) {
+ PackOpModel<float> model({TensorType_FLOAT32, {2}}, 1, 3);
+ model.SetInput(0, {1, 4});
+ model.SetInput(1, {2, 5});
+ model.SetInput(2, {3, 6});
+ model.Invoke();
+ EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(PackOpTest, FloatMultiDimensions) {
+ PackOpModel<float> model({TensorType_FLOAT32, {2, 3}}, 1, 2);
+ model.SetInput(0, {1, 2, 3, 4, 5, 6});
+ model.SetInput(1, {7, 8, 9, 10, 11, 12});
+ model.Invoke();
+ EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+ EXPECT_THAT(model.GetOutput(),
+ ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+
+TEST(PackOpTest, IntThreeInputs) {
+ PackOpModel<int32_t> model({TensorType_INT32, {2}}, 0, 3);
+ model.SetInput(0, {1, 4});
+ model.SetInput(1, {2, 5});
+ model.SetInput(2, {3, 6});
+ model.Invoke();
+ EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
+}
+
+TEST(PackOpTest, IntThreeInputsDifferentAxis) {
+ PackOpModel<int32_t> model({TensorType_INT32, {2}}, 1, 3);
+ model.SetInput(0, {1, 4});
+ model.SetInput(1, {2, 5});
+ model.SetInput(2, {3, 6});
+ model.Invoke();
+ EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(PackOpTest, IntMultiDimensions) {
+ PackOpModel<int32_t> model({TensorType_INT32, {2, 3}}, 1, 2);
+ model.SetInput(0, {1, 2, 3, 4, 5, 6});
+ model.SetInput(1, {7, 8, 9, 10, 11, 12});
+ model.Invoke();
+ EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+ EXPECT_THAT(model.GetOutput(),
+ ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+} // namespace
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ ::tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/contrib/lite/kernels/pooling.cc
index 9b0487ae16..29a5be0683 100644
--- a/tensorflow/contrib/lite/kernels/pooling.cc
+++ b/tensorflow/contrib/lite/kernels/pooling.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/reduce.cc b/tensorflow/contrib/lite/kernels/reduce.cc
index 52e4084ff8..e99f67c725 100644
--- a/tensorflow/contrib/lite/kernels/reduce.cc
+++ b/tensorflow/contrib/lite/kernels/reduce.cc
@@ -78,6 +78,10 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext* context, OpContext* op_context) {
size_t num_axis = NumElements(op_context->axis);
const TfLiteIntArray* input_dims = op_context->input->dims;
int input_num_dims = NumDimensions(op_context->input);
+ if (input_num_dims == 0) {
+ return context->ResizeTensor(context, op_context->output,
+ TfLiteIntArrayCreate(0));
+ }
const int* axis = GetTensorData<int>(op_context->axis);
if (op_context->params->keep_dims) {
TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_num_dims);
diff --git a/tensorflow/contrib/lite/kernels/reduce_test.cc b/tensorflow/contrib/lite/kernels/reduce_test.cc
index 7d28931ecd..5d432d34ef 100644
--- a/tensorflow/contrib/lite/kernels/reduce_test.cc
+++ b/tensorflow/contrib/lite/kernels/reduce_test.cc
@@ -22,6 +22,7 @@ namespace tflite {
namespace {
using ::testing::ElementsAreArray;
+using ::testing::IsEmpty;
class BaseOpModel : public SingleOpModel {
public:
@@ -197,6 +198,16 @@ TEST(ConstFloatMeanOpTest, KeepDims) {
ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
}
+TEST(ConstFloatMeanOpTest, Scalar) {
+ std::vector<float> data = {3.27};
+ MeanOpConstModel m({TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, {},
+ {0}, true);
+ m.SetInput(data);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+ EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({3.27})));
+}
+
TEST(DynamicFloatMeanOpTest, NotKeepDims) {
std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
@@ -342,6 +353,16 @@ TEST(DynamicFloatSumOpTest, NotKeepDims) {
ElementsAreArray(ArrayFloatNear({144, 156})));
}
+TEST(ConstFloatSumOpTest, Scalar) {
+ std::vector<float> data = {17.};
+ SumOpConstModel m({TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, {}, {0},
+ false);
+ m.SetInput(data);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+ EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({17.})));
+}
+
TEST(DynamicFloatSumOpTest, KeepDims) {
std::vector<float> data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
@@ -630,6 +651,20 @@ TEST(DynamicUint8MaxOpTest, KeepDims) {
ArrayFloatNear({11.1294, 0.862745}, kQuantizedTolerance)));
}
+TEST(DynamicUint8MaxOpTest, Scalar) {
+ float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+ std::vector<float> data = {11.14};
+ MaxOpDynamicModel m({TensorType_UINT8, {}, -10.0, 12.0},
+ {TensorType_UINT8, {}, -10.0, 12.0},
+ {TensorType_INT32, {1}}, true);
+ std::vector<int> axis = {0};
+ m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
+}
+
} // namespace
} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index f0f2757277..e632728841 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -106,6 +106,9 @@ TfLiteRegistration* Register_RSQRT();
TfLiteRegistration* Register_SHAPE();
TfLiteRegistration* Register_POW();
TfLiteRegistration* Register_FAKE_QUANT();
+TfLiteRegistration* Register_PACK();
+TfLiteRegistration* Register_ONE_HOT();
+TfLiteRegistration* Register_LOGICAL_OR();
BuiltinOpResolver::BuiltinOpResolver() {
AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -195,6 +198,9 @@ BuiltinOpResolver::BuiltinOpResolver() {
AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE());
AddBuiltin(BuiltinOperator_POW, Register_POW());
AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2);
+ AddBuiltin(BuiltinOperator_PACK, Register_PACK());
+ AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT());
+ AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
// TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
// custom ops aren't always included by default.
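With the registrations in place, no further wiring is needed on the caller's side. A hedged usage sketch of how an application picks these ops up through the resolver, assuming the standard InterpreterBuilder flow (not part of this patch):

#include <memory>

#include "tensorflow/contrib/lite/interpreter.h"
#include "tensorflow/contrib/lite/kernels/register.h"
#include "tensorflow/contrib/lite/model.h"

std::unique_ptr<tflite::Interpreter> BuildFromModel(
    const tflite::FlatBufferModel& model) {
  // The resolver now maps BuiltinOperator_PACK / _ONE_HOT / _LOGICAL_OR to
  // the registrations added above.
  tflite::ops::builtin::BuiltinOpResolver resolver;
  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::InterpreterBuilder(model, resolver)(&interpreter);
  return interpreter;
}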
diff --git a/tensorflow/contrib/lite/kernels/reshape.cc b/tensorflow/contrib/lite/kernels/reshape.cc
index 3287040695..49ba0571e2 100644
--- a/tensorflow/contrib/lite/kernels/reshape.cc
+++ b/tensorflow/contrib/lite/kernels/reshape.cc
@@ -25,16 +25,11 @@ namespace builtin {
namespace reshape {
constexpr int kInputTensor = 0;
+constexpr int kShapeTensor = 1;
constexpr int kOutputTensor = 0;
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
- auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data);
-
- // TODO(ahentz): we are often given a tensor with the shape but we only pay
- // attention to what the shape specified in 'params'.
- TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
- TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
+TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node,
+ TfLiteIntArray* output_shape) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
@@ -42,37 +37,84 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
// special -1 value, meaning it will be calculated automatically based on the
// input. Here we calculate what that dimension should be so that the number
// of output elements is the same as the number of input elements.
- int num_input_elements = 1;
- for (int i = 0; i < NumDimensions(input); ++i) {
- num_input_elements *= SizeOfDimension(input, i);
- }
+ int num_input_elements = NumElements(input);
- TfLiteIntArray* output_size = TfLiteIntArrayCreate(params->num_dimensions);
int num_output_elements = 1;
int stretch_dim = -1;
- for (int i = 0; i < params->num_dimensions; ++i) {
- int value = params->shape[i];
+ for (int i = 0; i < output_shape->size; ++i) {
+ int value = output_shape->data[i];
if (value == -1) {
TF_LITE_ENSURE_EQ(context, stretch_dim, -1);
stretch_dim = i;
} else {
num_output_elements *= value;
- output_size->data[i] = value;
}
}
if (stretch_dim != -1) {
- output_size->data[stretch_dim] = num_input_elements / num_output_elements;
- num_output_elements *= output_size->data[stretch_dim];
+ output_shape->data[stretch_dim] = num_input_elements / num_output_elements;
+ num_output_elements *= output_shape->data[stretch_dim];
}
TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
- return context->ResizeTensor(context, output, output_size);
+ return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus ResizeOutputWithShapeTensor(TfLiteContext* context,
+ TfLiteNode* node) {
+ const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
+
+ TfLiteIntArray* output_shape = TfLiteIntArrayCreate(shape->dims->data[0]);
+ for (int i = 0; i < output_shape->size; ++i) {
+ output_shape->data[i] = shape->data.i32[i];
+ }
+ return ResizeOutput(context, node, output_shape);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+ auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data);
+
+ TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
+ TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+ // Attempt to use shape tensor if it exists.
+ if (NumInputs(node) == 2) {
+ const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
+ // Check if the shape tensor is valid.
+ if (shape->dims->size == 1 && shape->type == kTfLiteInt32) {
+ // Set the output tensor as dynamic if the shape isn't constant.
+ if (!IsConstantTensor(shape)) {
+ TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+ SetTensorToDynamic(output);
+ return kTfLiteOk;
+ }
+ // Shape is constant. Resize now.
+ return ResizeOutputWithShapeTensor(context, node);
+ }
+ }
+ // If the shape tensor was usable, the function has already returned above.
+ // Now fall back to the shape parameter in `TfLiteReshapeParams`.
+ int num_dimensions = params->num_dimensions;
+ if (num_dimensions == 1 && params->shape[0] == 0) {
+ // Legacy tflite models use a shape parameter of [0] to indicate scalars,
+ // so adjust accordingly. TODO(b/111614235): Allow zero-sized buffers during
+ // toco conversion.
+ num_dimensions = 0;
+ }
+ TfLiteIntArray* output_shape = TfLiteIntArrayCreate(num_dimensions);
+ for (int i = 0; i < num_dimensions; ++i) {
+ output_shape->data[i] = params->shape[i];
+ }
+ return ResizeOutput(context, node, output_shape);
}
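A standalone sketch of the stretch-dimension (-1) resolution performed by ResizeOutput(), for an 8-element input reshaped to {2, 1, -1} (cf. the WithStretchDimension test):

#include <cstdio>

int main() {
  int output_shape[] = {2, 1, -1};
  const int num_dims = 3;
  const int num_input_elements = 8;  // e.g. a {1, 2, 4, 1} input
  int num_output_elements = 1;
  int stretch_dim = -1;
  for (int i = 0; i < num_dims; ++i) {
    if (output_shape[i] == -1) {
      stretch_dim = i;  // at most one -1 is allowed
    } else {
      num_output_elements *= output_shape[i];
    }
  }
  if (stretch_dim != -1) {
    // The single -1 absorbs whatever factor makes the element counts match.
    output_shape[stretch_dim] = num_input_elements / num_output_elements;
  }
  std::printf("%d %d %d\n", output_shape[0], output_shape[1], output_shape[2]);
  // prints: 2 1 4
  return 0;
}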
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
const TfLiteTensor* input = GetInput(context, node, kInputTensor);
TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+ if (IsDynamicTensor(output)) {
+ TF_LITE_ENSURE_OK(context, ResizeOutputWithShapeTensor(context, node));
+ }
+
memcpy(output->data.raw, input->data.raw, input->bytes);
return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/reshape_test.cc b/tensorflow/contrib/lite/kernels/reshape_test.cc
index aecbd0399f..52d71350d3 100644
--- a/tensorflow/contrib/lite/kernels/reshape_test.cc
+++ b/tensorflow/contrib/lite/kernels/reshape_test.cc
@@ -22,18 +22,27 @@ namespace tflite {
namespace {
using ::testing::ElementsAreArray;
+using ::testing::IsEmpty;
class ReshapeOpModel : public SingleOpModel {
public:
ReshapeOpModel(std::initializer_list<int> input_shape,
- std::initializer_list<int> new_shape) {
+ std::initializer_list<int> new_shape,
+ bool use_shape_input_tensor = false) {
input_ = AddInput(TensorType_FLOAT32);
output_ = AddOutput(TensorType_FLOAT32);
+ int shape_input_tensor =
+ use_shape_input_tensor ? AddInput(TensorType_INT32) : -1;
SetBuiltinOp(
BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
CreateReshapeOptions(builder_, builder_.CreateVector<int>(new_shape))
.Union());
- BuildInterpreter({input_shape});
+ if (use_shape_input_tensor) {
+ BuildInterpreter({input_shape, GetShape(shape_input_tensor)});
+ PopulateTensor<int>(shape_input_tensor, new_shape);
+ } else {
+ BuildInterpreter({input_shape});
+ }
}
void SetInput(std::initializer_list<float> data) {
@@ -71,6 +80,14 @@ TEST(ReshapeOpTest, SimpleTest) {
EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2}));
}
+TEST(ReshapeOpTest, ShapeTensorInput) {
+ ReshapeOpModel m({1, 2, 4, 1}, {2, 2, 2}, /*use_shape_input_tensor=*/true);
+ m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2}));
+}
+
TEST(ReshapeOpTest, WithStretchDimension) {
ReshapeOpModel m({1, 2, 4, 1}, {2, 1, -1});
m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
@@ -79,6 +96,22 @@ TEST(ReshapeOpTest, WithStretchDimension) {
EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 4}));
}
+TEST(ReshapeOpTest, ScalarOutput) {
+ ReshapeOpModel m({1}, {});
+ m.SetInput({3});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+ EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+}
+
+TEST(ReshapeOpTest, LegacyScalarOutput) {
+ ReshapeOpModel m({1}, {0});
+ m.SetInput({3});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutput(), ElementsAreArray({3}));
+ EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+}
+
} // namespace
} // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
index 10caffea03..f4289105f7 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
@@ -247,7 +247,7 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
3, 6, //
9, 12, //
4, 10, //
- 10, 16 //
+ 12, 16 //
});
m.SetSize({3, 3});
m.Invoke();
@@ -256,8 +256,8 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
7, 9, 10, //
9, 11, 12, //
4, 8, 10, //
- 8, 12, 14, //
- 10, 13, 16, //
+ 9, 12, 14, //
+ 12, 14, 16, //
})));
ResizeBilinearOpModel const_m({TensorType_UINT8, {2, 2, 2, 1}}, {3, 3});
@@ -265,7 +265,7 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
3, 6, //
9, 12, //
4, 10, //
- 10, 16 //
+ 12, 16 //
});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
@@ -273,35 +273,35 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
7, 9, 10, //
9, 11, 12, //
4, 8, 10, //
- 8, 12, 14, //
- 10, 13, 16, //
+ 9, 12, 14, //
+ 12, 14, 16, //
})));
}
TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
m.SetInput<uint8>({
- 3, 4, 6, 10, //
- 9, 10, 12, 16, //
+ 3, 4, 6, 10, //
+ 10, 12, 14, 16, //
});
m.SetSize({3, 3});
m.Invoke();
EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
- 3, 4, 5, 8, 6, 10, //
- 7, 8, 9, 12, 10, 14, //
- 9, 10, 11, 13, 12, 16, //
+ 3, 4, 5, 8, 6, 10, //
+ 7, 9, 10, 12, 11, 14, //
+ 10, 12, 12, 14, 14, 16, //
})));
ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 2}}, {3, 3});
const_m.SetInput<uint8>({
- 3, 4, 6, 10, //
- 9, 10, 12, 16, //
+ 3, 4, 6, 10, //
+ 10, 12, 14, 16, //
});
const_m.Invoke();
EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
- 3, 4, 5, 8, 6, 10, //
- 7, 8, 9, 12, 10, 14, //
- 9, 10, 11, 13, 12, 16, //
+ 3, 4, 5, 8, 6, 10, //
+ 7, 9, 10, 12, 11, 14, //
+ 10, 12, 12, 14, 14, 16, //
})));
}
} // namespace
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
index c9269599e5..03079f1c3b 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
@@ -113,7 +113,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
}
-#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar) \
+#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar, pad_value) \
type::SpaceToBatchND(GetTensorData<scalar>(op_context.input), \
GetTensorDims(op_context.input), \
GetTensorData<int32_t>(op_context.block_shape), \
@@ -121,34 +121,36 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
GetTensorData<int32_t>(op_context.paddings), \
GetTensorDims(op_context.paddings), \
GetTensorData<scalar>(op_context.output), \
- GetTensorDims(op_context.output))
+ GetTensorDims(op_context.output), pad_value)
switch (op_context.input->type) { // Already know in/out types are same.
case kTfLiteFloat32:
if (kernel_type == kReference) {
- TF_LITE_SPACE_TO_BATCH_ND(reference_ops, float);
+ TF_LITE_SPACE_TO_BATCH_ND(reference_ops, float, 0);
} else {
- TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, float);
+ TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, float, 0);
}
break;
case kTfLiteUInt8:
if (kernel_type == kReference) {
- TF_LITE_SPACE_TO_BATCH_ND(reference_ops, uint8_t);
+ TF_LITE_SPACE_TO_BATCH_ND(reference_ops, uint8_t,
+ op_context.output->params.zero_point);
} else {
- TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, uint8_t);
+ TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, uint8_t,
+ op_context.output->params.zero_point);
}
break;
case kTfLiteInt32:
if (kernel_type == kReference) {
- TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t);
+ TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t, 0);
} else {
- TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int32_t);
+ TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int32_t, 0);
}
break;
case kTfLiteInt64:
if (kernel_type == kReference) {
- TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int64_t);
+ TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int64_t, 0);
} else {
- TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int64_t);
+ TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int64_t, 0);
}
break;
default:
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
index 92a4a037d5..5756573629 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
@@ -23,6 +23,7 @@ namespace tflite {
namespace {
using ::testing::ElementsAreArray;
+using ::testing::Matcher;
class SpaceToBatchNDOpModel : public SingleOpModel {
public:
@@ -30,6 +31,10 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
PopulateTensor<float>(input_, data);
}
+ void SetQuantizedInput(std::initializer_list<float> data) {
+ QuantizeAndPopulate<uint8_t>(input_, data);
+ }
+
void SetBlockShape(std::initializer_list<int> data) {
PopulateTensor<int>(block_shape_, data);
}
@@ -41,6 +46,11 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+ std::vector<float> GetDequantizedOutput() {
+ return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+ GetScale(output_), GetZeroPoint(output_));
+ }
+
protected:
int input_;
int block_shape_;
@@ -56,18 +66,19 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
// m.Invoke();
class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel {
public:
- SpaceToBatchNDOpConstModel(std::initializer_list<int> input_shape,
+ SpaceToBatchNDOpConstModel(const TensorData& input,
std::initializer_list<int> block_shape,
- std::initializer_list<int> paddings) {
- input_ = AddInput(TensorType_FLOAT32);
+ std::initializer_list<int> paddings,
+ const TensorData& output) {
+ input_ = AddInput(input);
block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2});
- output_ = AddOutput(TensorType_FLOAT32);
+ output_ = AddOutput(output);
SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
BuiltinOptions_SpaceToBatchNDOptions,
CreateSpaceToBatchNDOptions(builder_).Union());
- BuildInterpreter({input_shape});
+ BuildInterpreter({input.shape});
}
};
@@ -81,26 +92,30 @@ class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel {
// m.Invoke();
class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel {
public:
- SpaceToBatchNDOpDynamicModel(std::initializer_list<int> input_shape) {
- input_ = AddInput(TensorType_FLOAT32);
+ SpaceToBatchNDOpDynamicModel(const TensorData& input,
+ const TensorData& output) {
+ input_ = AddInput(input);
block_shape_ = AddInput(TensorType_INT32);
paddings_ = AddInput(TensorType_INT32);
- output_ = AddOutput(TensorType_FLOAT32);
+ output_ = AddOutput(output);
SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
BuiltinOptions_SpaceToBatchNDOptions,
CreateSpaceToBatchNDOptions(builder_).Union());
- BuildInterpreter({input_shape, {2}, {2, 2}});
+ BuildInterpreter({input.shape, {2}, {2, 2}});
}
};
TEST(SpaceToBatchNDOpTest, InvalidShapeTest) {
- EXPECT_DEATH(SpaceToBatchNDOpConstModel({1, 3, 3, 1}, {2, 2}, {0, 0, 0, 0}),
- "Cannot allocate tensors");
+ EXPECT_DEATH(
+ SpaceToBatchNDOpConstModel({TensorType_FLOAT32, {1, 3, 3, 1}}, {2, 2},
+ {0, 0, 0, 0}, {TensorType_FLOAT32}),
+ "Cannot allocate tensors");
}
TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
- SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0});
+ SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 4, 4, 1}}, {2, 2},
+ {0, 0, 0, 0}, {TensorType_FLOAT32});
m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
m.Invoke();
EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
@@ -109,7 +124,8 @@ TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
}
TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) {
- SpaceToBatchNDOpDynamicModel m({1, 4, 4, 1});
+ SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {1, 4, 4, 1}},
+ {TensorType_FLOAT32});
m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
m.SetBlockShape({2, 2});
m.SetPaddings({0, 0, 0, 0});
@@ -120,7 +136,8 @@ TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) {
}
TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) {
- SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0});
+ SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {2, 2, 4, 1}}, {2, 2},
+ {0, 0, 0, 0}, {TensorType_FLOAT32});
m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
m.Invoke();
EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
@@ -129,7 +146,8 @@ TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) {
}
TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) {
- SpaceToBatchNDOpDynamicModel m({2, 2, 4, 1});
+ SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+ {TensorType_FLOAT32});
m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
m.SetBlockShape({2, 2});
m.SetPaddings({0, 0, 0, 0});
@@ -140,7 +158,8 @@ TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) {
}
TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) {
- SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0});
+ SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 5, 2, 1}}, {3, 2},
+ {1, 0, 2, 0}, {TensorType_FLOAT32});
m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
m.Invoke();
EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
@@ -151,7 +170,8 @@ TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) {
}
TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
- SpaceToBatchNDOpDynamicModel m({1, 5, 2, 1});
+ SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {1, 5, 2, 1}},
+ {TensorType_FLOAT32});
m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
m.SetBlockShape({3, 2});
m.SetPaddings({1, 0, 2, 0});
@@ -164,7 +184,8 @@ TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
}
TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) {
- SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4});
+ SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 4, 2, 1}}, {3, 2},
+ {1, 1, 2, 4}, {TensorType_FLOAT32});
m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
m.Invoke();
EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
@@ -176,7 +197,8 @@ TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) {
}
TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
- SpaceToBatchNDOpDynamicModel m({1, 4, 2, 1});
+ SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {1, 4, 2, 1}},
+ {TensorType_FLOAT32});
m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
m.SetBlockShape({3, 2});
m.SetPaddings({1, 1, 2, 4});
@@ -189,6 +211,88 @@ TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
}));
}
+class QuantizedSpaceToBatchNDOpTest : public ::testing::Test {
+ protected:
+ std::vector<Matcher<float>> DequantizedArrayNear(
+ const std::vector<float>& values, const float min, const float max) {
+ const float quantization_tolerance = (max - min) / 255.0;
+ return ArrayFloatNear(values, quantization_tolerance);
+ }
+};
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
+ // The test_util and actual quantization code currently require that the
+ // range include zero, but if that ever changes, this test will catch it.
+ EXPECT_DEATH(SpaceToBatchNDOpConstModel m(
+ {TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0}, {4, 2},
+ {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {}, 1.0, 2.0}),
+ ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTest) {
+ SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
+ {3, 2}, {1, 0, 2, 0},
+ {TensorType_UINT8, {}, -1.0, 1.0});
+ m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(DequantizedArrayNear(
+ {0, 0, 0, -0.5, 0, 0, 0, 0.6, 0, -0.1, 0, -0.7,
+ 0, 0.2, 0, 0.8, 0, -0.3, 0, -0.9, 0, 0.4, 0, 0.1},
+ -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
+ SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
+ {TensorType_UINT8, {}, -1.0, 1.0});
+ m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+ m.SetBlockShape({3, 2});
+ m.SetPaddings({1, 0, 2, 0});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(DequantizedArrayNear(
+ {0, 0, 0, -0.5, 0, 0, 0, 0.6, 0, -0.1, 0, -0.7,
+ 0, 0.2, 0, 0.8, 0, -0.3, 0, -0.9, 0, 0.4, 0, 0.1},
+ -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingConstTest) {
+ SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
+ {3, 2}, {1, 1, 2, 4},
+ {TensorType_UINT8, {}, -1.0, 1.0});
+ m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(DequantizedArrayNear(
+ {
+ 0, 0, 0, 0, 0, -0.5, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0,
+ 0, -0.1, 0, 0, 0, -0.7, 0, 0, 0, 0.2, 0, 0, 0, 0.8, 0, 0,
+ 0, -0.3, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0,
+ },
+ -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
+ SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
+ {TensorType_UINT8, {}, -1.0, 1.0});
+ m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+ m.SetBlockShape({3, 2});
+ m.SetPaddings({1, 1, 2, 4});
+ m.Invoke();
+ EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+ EXPECT_THAT(m.GetDequantizedOutput(),
+ ElementsAreArray(DequantizedArrayNear(
+ {
+ 0, 0, 0, 0, 0, -0.5, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0,
+ 0, -0.1, 0, 0, 0, -0.7, 0, 0, 0, 0.2, 0, 0, 0, 0.8, 0, 0,
+ 0, -0.3, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0, 0, 0, 0, 0,
+ },
+ -1.0, 1.0)));
+}
+
} // namespace
} // namespace tflite
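Note on the kernel change at the top of this section: it threads `op_context.output->params.zero_point` into the padding value for the quantized uint8 path (the int32/int64 paths pass 0 because they are not quantized). Padded cells must hold the quantized encoding of 0.0, not the raw byte 0, or they dequantize to `-zero_point * scale` instead of zero, which is exactly what the quantized padding tests above verify. A minimal self-contained sketch of that arithmetic, not the TFLite implementation:

```cpp
#include <cstdint>
#include <iostream>

// Affine quantization: real ~= (q - zero_point) * scale. Round-to-nearest
// for non-negative inputs only; this is a sketch, not production code.
uint8_t Quantize(float real, float scale, int32_t zero_point) {
  int32_t q = static_cast<int32_t>(real / scale + 0.5f) + zero_point;
  if (q < 0) q = 0;
  if (q > 255) q = 255;
  return static_cast<uint8_t>(q);
}

int main() {
  // Roughly the [-1, 1] uint8 range used by the tests above.
  const float scale = 2.0f / 255.0f;
  const int32_t zero_point = 128;
  // Padding with the raw byte 0 dequantizes to about -1.0 ...
  std::cout << (0 - zero_point) * scale << "\n";
  // ... while padding with the zero point dequantizes to exactly 0.0.
  const uint8_t pad = Quantize(0.0f, scale, zero_point);
  std::cout << (pad - zero_point) * scale << "\n";
  return 0;
}
```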
diff --git a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc
index 404c32ad9c..7be5e66c16 100644
--- a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc
+++ b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc
index 22eebdd4ce..6d4912ce3a 100644
--- a/tensorflow/contrib/lite/kernels/svdf.cc
+++ b/tensorflow/contrib/lite/kernels/svdf.cc
@@ -16,7 +16,6 @@ limitations under the License.
// SVDF op that compresses a fully connected op via low-rank matrix
// factorization. See https://research.google.com/pubs/archive/43813.pdf for
// details.
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
@@ -105,7 +104,7 @@ constexpr int kStateTensor = 0;
constexpr int kOutputTensor = 1;
void* Init(TfLiteContext* context, const char* buffer, size_t length) {
- auto* op_data = new OpData;
+ auto* op_data = new OpData();
op_data->float_weights_time_initialized = false;
context->AddTensors(context, /*tensors_to_add=*/4,
&op_data->scratch_tensor_index);
diff --git a/tensorflow/contrib/lite/kernels/transpose_conv.cc b/tensorflow/contrib/lite/kernels/transpose_conv.cc
index 8b9deeed20..a9baa5c698 100644
--- a/tensorflow/contrib/lite/kernels/transpose_conv.cc
+++ b/tensorflow/contrib/lite/kernels/transpose_conv.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
index c48b470f92..0acd705950 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
index 164a0cbd08..0d6d29a171 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 5e6106a87e..5814cddc5b 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -19,7 +19,6 @@ limitations under the License.
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
-#include <unistd.h>
#include "tensorflow/contrib/lite/allocation.h"
#include "tensorflow/contrib/lite/builtin_op_data.h"
@@ -706,6 +705,15 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
*builtin_data = static_cast<void*>(params);
break;
}
+ case BuiltinOperator_PACK: {
+ TfLitePackParams* params = MallocPOD<TfLitePackParams>();
+ if (auto* pack_params = op->builtin_options_as_PackOptions()) {
+ params->values_count = pack_params->values_count();
+ params->axis = pack_params->axis();
+ }
+ *builtin_data = reinterpret_cast<void*>(params);
+ break;
+ }
case BuiltinOperator_DELEGATE: {
// TODO(ycling): Revisit when supporting saving delegated models.
error_reporter->Report("DELEGATE op shouldn't exist in model.");
@@ -722,6 +730,14 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
*builtin_data = static_cast<void*>(params);
break;
}
+ case BuiltinOperator_ONE_HOT: {
+ auto* params = MallocPOD<TfLiteOneHotParams>();
+ if (auto* schema_params = op->builtin_options_as_OneHotOptions()) {
+ params->axis = schema_params->axis();
+ }
+ *builtin_data = static_cast<void*>(params);
+ break;
+ }
// Below are the ops with no builtin_data structure.
case BuiltinOperator_BATCH_TO_SPACE_ND:
@@ -764,7 +780,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
case BuiltinOperator_TOPK_V2:
case BuiltinOperator_TRANSPOSE:
case BuiltinOperator_POW:
- case BuiltinOperator_PACK:
+ case BuiltinOperator_LOGICAL_OR:
break;
}
return kTfLiteOk;
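Note: the two new cases follow `ParseOpData`'s standard recipe — allocate a zeroed plain-old-data params struct, copy each field out of the flatbuffer options table if one is present, and pass ownership back through `builtin_data`. A self-contained analog of the recipe, where `FooOptions` and `TfLiteFooParams` are hypothetical stand-ins for the generated schema types:

```cpp
#include <cstdlib>

// Hypothetical stand-ins for the real generated types; the control flow
// mirrors the PACK and ONE_HOT cases above.
struct TfLiteFooParams { int axis; };
struct FooOptions {
  int axis_value;
  int axis() const { return axis_value; }
};

// malloc + zero-fill, as MallocPOD does for TfLite*Params structs.
template <typename T>
T* MallocPOD() {
  return static_cast<T*>(std::calloc(1, sizeof(T)));
}

void ParseFoo(const FooOptions* schema_params, void** builtin_data) {
  TfLiteFooParams* params = MallocPOD<TfLiteFooParams>();
  if (schema_params != nullptr) {
    params->axis = schema_params->axis();  // copy each field out of the table
  }
  *builtin_data = static_cast<void*>(params);  // interpreter frees it later
}
```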
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index 15bae21a41..df4f60d4ad 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
-#include <unistd.h>
#include "tensorflow/contrib/lite/model.h"
@@ -242,14 +241,6 @@ TEST(BasicFlatBufferModel, TestWithNullVerifier) {
"tensorflow/contrib/lite/testdata/test_model.bin", nullptr));
}
-struct TestErrorReporter : public ErrorReporter {
- int Report(const char* format, va_list args) override {
- calls++;
- return 0;
- }
- int calls = 0;
-};
-
// This makes sure the ErrorReporter is marshalled from FlatBufferModel to
// the Interpreter.
TEST(BasicFlatBufferModel, TestCustomErrorReporter) {
@@ -263,7 +254,7 @@ TEST(BasicFlatBufferModel, TestCustomErrorReporter) {
TrivialResolver resolver;
InterpreterBuilder(*model, resolver)(&interpreter);
ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
- ASSERT_EQ(reporter.calls, 1);
+ ASSERT_EQ(reporter.num_calls(), 1);
}
// This makes sure the ErrorReporter is marshalled from FlatBufferModel to
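Note: the locally defined `TestErrorReporter` is deleted here in favor of a shared test utility whose call count is read through `num_calls()` rather than a public field. A minimal sketch of such a counting reporter, assuming the `Report(const char*, va_list)` interface shown in the removed struct:

```cpp
#include <cstdarg>

// Assumed base interface, matching the override in the removed struct.
class ErrorReporter {
 public:
  virtual ~ErrorReporter() {}
  virtual int Report(const char* format, va_list args) = 0;
};

// Counts how many times the interpreter reports an error; tests assert on
// num_calls() instead of poking a public member.
class TestErrorReporter : public ErrorReporter {
 public:
  int Report(const char* /*format*/, va_list /*args*/) override {
    ++num_calls_;
    return 0;
  }
  int num_calls() const { return num_calls_; }

 private:
  int num_calls_ = 0;
};
```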
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 710ce1632e..1c06b29deb 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -560,6 +560,14 @@ TfLiteStatus AddOpsAndParams(
nnapi_version = 11; // require NNAPI 1.1
nn_op_type = ANEURALNETWORKS_TRANSPOSE;
break;
+ case tflite::BuiltinOperator_L2_NORMALIZATION:
+ nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION;
+ if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data)
+ ->activation != kTfLiteActNone) {
+ FATAL(
+ "NNAPI does not support L2Normalization with fused activations");
+ }
+ break;
case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
case tflite::BuiltinOperator_LSH_PROJECTION:
case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
@@ -568,7 +576,6 @@ TfLiteStatus AddOpsAndParams(
case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
- case tflite::BuiltinOperator_L2_NORMALIZATION:
case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
case tflite::BuiltinOperator_PADV2:
case tflite::BuiltinOperator_RESIZE_BILINEAR:
@@ -615,6 +622,8 @@ TfLiteStatus AddOpsAndParams(
case tflite::BuiltinOperator_POW:
case tflite::BuiltinOperator_FAKE_QUANT:
case tflite::BuiltinOperator_PACK:
+ case tflite::BuiltinOperator_LOGICAL_OR:
+ case tflite::BuiltinOperator_ONE_HOT:
logError("Op code %d is currently not delegated to NNAPI", builtin);
return kTfLiteError;
break;
diff --git a/tensorflow/contrib/lite/profiling/BUILD b/tensorflow/contrib/lite/profiling/BUILD
index b29ca330dc..1172722f7a 100644
--- a/tensorflow/contrib/lite/profiling/BUILD
+++ b/tensorflow/contrib/lite/profiling/BUILD
@@ -20,7 +20,6 @@ cc_test(
srcs = ["profiler_test.cc"],
copts = ["-DTFLITE_PROFILING_ENABLED"],
defines = ["TFLITE_PROFILING_ENABLED"],
- tags = ["no_oss"],
deps = [
":profiler",
"//tensorflow/contrib/lite/testing:util",
@@ -77,7 +76,6 @@ cc_test(
srcs = ["profile_buffer_test.cc"],
copts = ["-DTFLITE_PROFILING_ENABLED"],
defines = ["TFLITE_PROFILING_ENABLED"],
- tags = ["no_oss"],
deps = [
":profile_buffer",
"//tensorflow/contrib/lite/testing:util",
diff --git a/tensorflow/contrib/lite/profiling/profile_summarizer.cc b/tensorflow/contrib/lite/profiling/profile_summarizer.cc
index 36e87b666a..720bd717b9 100644
--- a/tensorflow/contrib/lite/profiling/profile_summarizer.cc
+++ b/tensorflow/contrib/lite/profiling/profile_summarizer.cc
@@ -23,8 +23,6 @@ namespace tflite {
namespace profiling {
namespace {
-using Detail = tensorflow::StatsCalculator::Detail;
-
struct OperatorDetails {
std::string name;
std::vector<std::string> inputs;
@@ -125,28 +123,17 @@ void ProfileSummarizer::ProcessProfiles(
int64_t base_start_us = events[0]->begin_timestamp_us;
int node_num = 0;
int64_t curr_total_us = 0;
- std::map<std::string, Detail> details;
for (auto event : events) {
auto op_details = GetOperatorDetails(interpreter, event->event_metadata);
auto node_name = ToString(op_details.outputs);
- auto result = details.emplace(node_name, Detail());
- Detail* detail = &(result.first->second);
- detail->start_us.UpdateStat(event->begin_timestamp_us - base_start_us);
+ int64_t start_us = event->begin_timestamp_us - base_start_us;
int64_t node_exec_time =
event->end_timestamp_us - event->begin_timestamp_us;
- detail->rel_end_us.UpdateStat(node_exec_time);
+ stats_calculator_->AddNodeStats(node_name, op_details.name, node_num,
+ start_us, node_exec_time, /*memory=*/0);
curr_total_us += node_exec_time;
++node_num;
-
- if (result.second) {
- detail->name = node_name;
- detail->type = op_details.name;
- detail->run_order = node_num;
- detail->times_called = 0;
- }
- ++detail->times_called;
}
- stats_calculator_->UpdateDetails(details);
stats_calculator_->UpdateRunTotalUs(curr_total_us);
}
} // namespace profiling
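Note: the summarizer no longer keeps its own `std::map<std::string, Detail>`; it forwards one `AddNodeStats(...)` record per profiling event and lets the shared `StatsCalculator` own run order, call counts, and timing aggregation. A toy sketch of the bookkeeping that moves behind that call, under assumed semantics (the real `tensorflow::StatsCalculator` tracks richer statistics, including memory):

```cpp
#include <cstdint>
#include <map>
#include <string>

// Toy per-node aggregate; a sketch only, not the StatsCalculator API.
struct NodeStat {
  std::string type;
  int run_order = 0;
  int64_t total_us = 0;
  int times_called = 0;
};

class ToyStatsCalculator {
 public:
  void AddNodeStats(const std::string& name, const std::string& type,
                    int run_order, int64_t start_us, int64_t rel_end_us,
                    int64_t /*memory*/) {
    NodeStat& stat = details_[name];
    if (stat.times_called == 0) {  // first sighting: record identity
      stat.type = type;
      stat.run_order = run_order;
    }
    stat.total_us += rel_end_us;  // rel_end_us is the node's execution time
    ++stat.times_called;
    (void)start_us;  // a real implementation also aggregates start times
  }

 private:
  std::map<std::string, NodeStat> details_;
};
```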
diff --git a/tensorflow/contrib/lite/profiling/time.cc b/tensorflow/contrib/lite/profiling/time.cc
index 446660bb74..875ddb02bc 100644
--- a/tensorflow/contrib/lite/profiling/time.cc
+++ b/tensorflow/contrib/lite/profiling/time.cc
@@ -14,16 +14,34 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/lite/profiling/time.h"
+#if defined(_MSC_VER)
+#include <chrono> // NOLINT(build/c++11)
+#else
#include <sys/time.h>
+#endif
namespace tflite {
namespace profiling {
namespace time {
+
+#if defined(_MSC_VER)
+
+uint64_t NowMicros() {
+ return std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count();
+}
+
+#else
+
uint64_t NowMicros() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
+
+#endif // defined(_MSC_VER)
+
} // namespace time
} // namespace profiling
} // namespace tflite
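Note: the `_MSC_VER` branch makes `NowMicros()` buildable with MSVC, where `<sys/time.h>` and `gettimeofday` do not exist; `std::chrono::system_clock` supplies an equivalent microsecond wall clock. Both branches are wall clocks, so the value can jump if the system time is adjusted. A usage sketch for timing a span:

```cpp
#include <cstdint>
#include <iostream>

#include "tensorflow/contrib/lite/profiling/time.h"

int main() {
  const uint64_t begin_us = tflite::profiling::time::NowMicros();
  // ... work being profiled ...
  const uint64_t end_us = tflite::profiling::time::NowMicros();
  std::cout << "elapsed: " << (end_us - begin_us) << " us\n";
  return 0;
}
```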
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 727fbff38e..860aff9e7e 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -20,6 +20,7 @@ py_library(
deps = [
"//tensorflow/contrib/lite/python/interpreter_wrapper:tensorflow_wrap_interpreter_wrapper",
"//tensorflow/python:util",
+ "//third_party/py/numpy",
],
)
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index e1981ceae2..3243bddac8 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -18,6 +18,7 @@ from __future__ import division
from __future__ import print_function
import sys
+import numpy as np
from tensorflow.python.util.lazy_loader import LazyLoader
# Lazy load since some of the performance benchmark skylark rules
@@ -162,6 +163,9 @@ class Interpreter(object):
ValueError: If the interpreter could not resize the input tensor.
"""
self._ensure_safe()
+ # `ResizeInputTensor` now only accepts an int32 numpy array as the
+ # `tensor_size` parameter.
+ tensor_size = np.array(tensor_size, dtype=np.int32)
self._interpreter.ResizeInputTensor(input_index, tensor_size)
def get_output_details(self):
@@ -204,7 +208,7 @@ class Interpreter(object):
for i in range(10):
input().fill(3.)
interpreter.invoke()
- print("inference %s" % output)
+ print("inference %s" % output())
Notice how this function avoids making a numpy array directly. This is
because it is important to not hold actual numpy views to the data longer
diff --git a/tensorflow/contrib/lite/python/interpreter_test.py b/tensorflow/contrib/lite/python/interpreter_test.py
index 95fa4b8584..e77d52ca99 100644
--- a/tensorflow/contrib/lite/python/interpreter_test.py
+++ b/tensorflow/contrib/lite/python/interpreter_test.py
@@ -83,7 +83,7 @@ class InterpreterTest(test_util.TensorFlowTestCase):
test_input = np.array([[1, 2, 3, 4]], dtype=np.uint8)
expected_output = np.array([[4, 3, 2, 1]], dtype=np.uint8)
interpreter.resize_tensor_input(input_details[0]['index'],
- np.array(test_input.shape, dtype=np.int32))
+ test_input.shape)
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]['index'], test_input)
interpreter.invoke()
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index f97919363b..9ab05f3068 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -108,7 +108,9 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
ImportNumpy();
std::unique_ptr<tflite::Interpreter> interpreter;
- tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+ if (tflite::InterpreterBuilder(*model, resolver)(&interpreter) != kTfLiteOk) {
+ return nullptr;
+ }
return interpreter;
}
@@ -182,13 +184,37 @@ PyObject* PyTupleFromQuantizationParam(const TfLiteQuantizationParams& param) {
} // namespace
+InterpreterWrapper* InterpreterWrapper::CreateInterpreterWrapper(
+ std::unique_ptr<tflite::FlatBufferModel> model,
+ std::unique_ptr<PythonErrorReporter> error_reporter,
+ std::string* error_msg) {
+ if (!model) {
+ *error_msg = error_reporter->message();
+ return nullptr;
+ }
+
+ auto resolver = absl::make_unique<tflite::ops::builtin::BuiltinOpResolver>();
+ auto interpreter = CreateInterpreter(model.get(), *resolver);
+ if (!interpreter) {
+ *error_msg = error_reporter->message();
+ return nullptr;
+ }
+
+ InterpreterWrapper* wrapper =
+ new InterpreterWrapper(std::move(model), std::move(error_reporter),
+ std::move(resolver), std::move(interpreter));
+ return wrapper;
+}
+
InterpreterWrapper::InterpreterWrapper(
std::unique_ptr<tflite::FlatBufferModel> model,
- std::unique_ptr<PythonErrorReporter> error_reporter)
+ std::unique_ptr<PythonErrorReporter> error_reporter,
+ std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver,
+ std::unique_ptr<tflite::Interpreter> interpreter)
: model_(std::move(model)),
error_reporter_(std::move(error_reporter)),
- resolver_(absl::make_unique<tflite::ops::builtin::BuiltinOpResolver>()),
- interpreter_(CreateInterpreter(model_.get(), *resolver_)) {}
+ resolver_(std::move(resolver)),
+ interpreter_(std::move(interpreter)) {}
InterpreterWrapper::~InterpreterWrapper() {}
@@ -421,11 +447,8 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile(
std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
std::unique_ptr<tflite::FlatBufferModel> model =
tflite::FlatBufferModel::BuildFromFile(model_path, error_reporter.get());
- if (!model) {
- *error_msg = error_reporter->message();
- return nullptr;
- }
- return new InterpreterWrapper(std::move(model), std::move(error_reporter));
+ return CreateInterpreterWrapper(std::move(model), std::move(error_reporter),
+ error_msg);
}
InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
@@ -439,11 +462,8 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
std::unique_ptr<tflite::FlatBufferModel> model =
tflite::FlatBufferModel::BuildFromBuffer(buf, length,
error_reporter.get());
- if (!model) {
- *error_msg = error_reporter->message();
- return nullptr;
- }
- return new InterpreterWrapper(std::move(model), std::move(error_reporter));
+ return CreateInterpreterWrapper(std::move(model), std::move(error_reporter),
+ error_msg);
}
PyObject* InterpreterWrapper::ResetVariableTensorsToZero() {
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
index 556ec7117a..3e03751da4 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -69,14 +69,28 @@ class InterpreterWrapper {
PyObject* tensor(PyObject* base_object, int i);
private:
- InterpreterWrapper(std::unique_ptr<tflite::FlatBufferModel> model,
- std::unique_ptr<PythonErrorReporter> error_reporter);
+ // Helper function to construct an `InterpreterWrapper` object.
+ // It only returns an `InterpreterWrapper` if it can construct an
+ // `Interpreter`. Otherwise it returns `nullptr`.
+ static InterpreterWrapper* CreateInterpreterWrapper(
+ std::unique_ptr<tflite::FlatBufferModel> model,
+ std::unique_ptr<PythonErrorReporter> error_reporter,
+ std::string* error_msg);
+
+ InterpreterWrapper(
+ std::unique_ptr<tflite::FlatBufferModel> model,
+ std::unique_ptr<PythonErrorReporter> error_reporter,
+ std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver,
+ std::unique_ptr<tflite::Interpreter> interpreter);
// InterpreterWrapper is not copyable or assignable. We avoid the use of
// InterpreterWrapper() = delete here for SWIG compatibility.
InterpreterWrapper();
InterpreterWrapper(const InterpreterWrapper& rhs);
+ // The public functions that create `InterpreterWrapper` should ensure all
+ // these member variables are initialized successfully. Otherwise they
+ // should report the error and return `nullptr`.
const std::unique_ptr<tflite::FlatBufferModel> model_;
const std::unique_ptr<PythonErrorReporter> error_reporter_;
const std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver_;
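Note: this header change completes the refactoring in the .cc diff above. Construction becomes a static factory that builds each dependency in order, converts any failure into `*error_msg` plus `nullptr`, and only then invokes the fully-injected private constructor, so no half-initialized wrapper can escape. A generic sketch of the pattern with hypothetical names (`Model`, `Engine`, `Wrapper`):

```cpp
#include <memory>
#include <string>

// Hypothetical pieces standing in for FlatBufferModel, Interpreter, etc.
struct Model {};
struct Engine {};

// Any build failure yields nullptr, like CreateInterpreter above.
std::unique_ptr<Engine> BuildEngine(Model* model) {
  if (model == nullptr) return nullptr;
  return std::make_unique<Engine>();
}

class Wrapper {
 public:
  // The only way to obtain a Wrapper: returns nullptr with an error message
  // instead of ever exposing a half-initialized object.
  static Wrapper* Create(std::unique_ptr<Model> model,
                         std::string* error_msg) {
    if (!model) {
      *error_msg = "failed to load model";
      return nullptr;
    }
    auto engine = BuildEngine(model.get());
    if (!engine) {
      *error_msg = "failed to construct engine";
      return nullptr;
    }
    return new Wrapper(std::move(model), std::move(engine));
  }

 private:
  // Private: callers must go through Create, so every member is valid here.
  Wrapper(std::unique_ptr<Model> model, std::unique_ptr<Engine> engine)
      : model_(std::move(model)), engine_(std::move(engine)) {}

  const std::unique_ptr<Model> model_;
  const std::unique_ptr<Engine> engine_;
};
```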
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 29a1487c1f..2f9b9d469a 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -40,24 +40,23 @@ from google.protobuf import text_format as _text_format
from google.protobuf.message import DecodeError
from tensorflow.contrib.lite.python import lite_constants as constants
from tensorflow.contrib.lite.python.convert import build_toco_convert_protos # pylint: disable=unused-import
-from tensorflow.contrib.lite.python.convert import tensor_name
+from tensorflow.contrib.lite.python.convert import tensor_name as _tensor_name
from tensorflow.contrib.lite.python.convert import toco_convert
from tensorflow.contrib.lite.python.convert import toco_convert_protos # pylint: disable=unused-import
-from tensorflow.contrib.lite.python.convert_saved_model import freeze_saved_model
-from tensorflow.contrib.lite.python.convert_saved_model import get_tensors_from_tensor_names
-from tensorflow.contrib.lite.python.convert_saved_model import set_tensor_shapes
+from tensorflow.contrib.lite.python.convert_saved_model import freeze_saved_model as _freeze_saved_model
+from tensorflow.contrib.lite.python.convert_saved_model import get_tensors_from_tensor_names as _get_tensors_from_tensor_names
+from tensorflow.contrib.lite.python.convert_saved_model import set_tensor_shapes as _set_tensor_shapes
from tensorflow.contrib.lite.python.interpreter import Interpreter # pylint: disable=unused-import
from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs # pylint: disable=unused-import
from tensorflow.contrib.lite.python.op_hint import OpHint # pylint: disable=unused-import
from tensorflow.core.framework import graph_pb2 as _graph_pb2
from tensorflow.python import keras as _keras
from tensorflow.python.client import session as _session
-from tensorflow.python.framework import graph_util as tf_graph_util
-from tensorflow.python.framework.importer import import_graph_def
-from tensorflow.python.ops.variables import global_variables_initializer
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import tag_constants
-# from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import graph_util as _tf_graph_util
+from tensorflow.python.framework.importer import import_graph_def as _import_graph_def
+from tensorflow.python.ops.variables import global_variables_initializer as _global_variables_initializer
+from tensorflow.python.saved_model import signature_constants as _signature_constants
+from tensorflow.python.saved_model import tag_constants as _tag_constants
class TocoConverter(object):
@@ -196,7 +195,7 @@ class TocoConverter(object):
input_arrays or output_arrays contains an invalid tensor name.
"""
with _session.Session() as sess:
- sess.run(global_variables_initializer())
+ sess.run(_global_variables_initializer())
# Read GraphDef from file.
graph_def = _graph_pb2.GraphDef()
@@ -218,12 +217,12 @@ class TocoConverter(object):
raise ValueError(
"Unable to parse input file '{}'.".format(graph_def_file))
sess.graph.as_default()
- import_graph_def(graph_def, name="")
+ _import_graph_def(graph_def, name="")
# Get input and output tensors.
- input_tensors = get_tensors_from_tensor_names(sess.graph, input_arrays)
- output_tensors = get_tensors_from_tensor_names(sess.graph, output_arrays)
- set_tensor_shapes(input_tensors, input_shapes)
+ input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays)
+ output_tensors = _get_tensors_from_tensor_names(sess.graph, output_arrays)
+ _set_tensor_shapes(input_tensors, input_shapes)
# Check if graph is frozen.
if not _is_frozen_graph(sess):
@@ -261,12 +260,12 @@ class TocoConverter(object):
TocoConverter class.
"""
if tag_set is None:
- tag_set = set([tag_constants.SERVING])
+ tag_set = set([_tag_constants.SERVING])
if signature_key is None:
- signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+ signature_key = _signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
- result = freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
- output_arrays, tag_set, signature_key)
+ result = _freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
+ output_arrays, tag_set, signature_key)
return cls(
graph_def=result[0], input_tensors=result[1], output_tensors=result[2])
@@ -299,15 +298,15 @@ class TocoConverter(object):
# Get input and output tensors.
if input_arrays:
- input_tensors = get_tensors_from_tensor_names(sess.graph, input_arrays)
+ input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays)
else:
input_tensors = keras_model.inputs
if output_arrays:
- output_tensors = get_tensors_from_tensor_names(sess.graph, output_arrays)
+ output_tensors = _get_tensors_from_tensor_names(sess.graph, output_arrays)
else:
output_tensors = keras_model.outputs
- set_tensor_shapes(input_tensors, input_shapes)
+ _set_tensor_shapes(input_tensors, input_shapes)
graph_def = _freeze_graph(sess, output_tensors)
return cls(graph_def, input_tensors, output_tensors)
@@ -328,12 +327,12 @@ class TocoConverter(object):
for tensor in self._input_tensors:
if not tensor.get_shape():
raise ValueError("Provide an input shape for input array '{0}'.".format(
- tensor_name(tensor)))
+ _tensor_name(tensor)))
shape = tensor.get_shape().as_list()
if None in shape[1:]:
raise ValueError(
"None is only supported in the 1st dimension. Tensor '{0}' has "
- "invalid shape '{1}'.".format(tensor_name(tensor), shape))
+ "invalid shape '{1}'.".format(_tensor_name(tensor), shape))
elif shape[0] is None:
self._set_batch_size(batch_size=1)
@@ -343,7 +342,7 @@ class TocoConverter(object):
quantized_stats = []
invalid_stats = []
for tensor in self._input_tensors:
- name = tensor_name(tensor)
+ name = _tensor_name(tensor)
if name in self.quantized_input_stats:
quantized_stats.append(self.quantized_input_stats[name])
else:
@@ -381,7 +380,7 @@ class TocoConverter(object):
Returns:
List of strings.
"""
- return [tensor_name(tensor) for tensor in self._input_tensors]
+ return [_tensor_name(tensor) for tensor in self._input_tensors]
def _set_batch_size(self, batch_size):
"""Sets the first dimension of the input tensor to `batch_size`.
@@ -428,11 +427,9 @@ def _freeze_graph(sess, output_tensors):
Frozen GraphDef.
"""
if not _is_frozen_graph(sess):
- sess.run(global_variables_initializer())
- output_arrays = [tensor_name(tensor) for tensor in output_tensors]
- return tf_graph_util.convert_variables_to_constants(sess, sess.graph_def,
- output_arrays)
+ sess.run(_global_variables_initializer())
+ output_arrays = [_tensor_name(tensor) for tensor in output_tensors]
+ return _tf_graph_util.convert_variables_to_constants(
+ sess, sess.graph_def, output_arrays)
else:
return sess.graph_def
-
-# remove_undocumented(__name__)
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 0434199a08..8ed98ddaf4 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -165,6 +165,8 @@ enum BuiltinOperator : byte {
REDUCE_PROD = 81,
REDUCE_MAX = 82,
PACK = 83,
+ LOGICAL_OR = 84,
+ ONE_HOT = 85,
}
// Options for the builtin operators.
@@ -228,6 +230,8 @@ union BuiltinOptions {
ArgMinOptions,
FakeQuantOptions,
PackOptions,
+ LogicalOrOptions,
+ OneHotOptions,
}
enum Padding : byte { SAME, VALID }
@@ -544,6 +548,13 @@ table PackOptions {
axis:int;
}
+table LogicalOrOptions {
+}
+
+table OneHotOptions {
+ axis:int;
+}
+
// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
// builtin, or a string if the operator is custom.
table OperatorCode {
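Note: client code serializes the two new tables through the generated `Create*Options` helpers and stores the result in the operator's `BuiltinOptions` union, mirroring the `CreateSpaceToBatchNDOptions(builder_).Union()` call in the tests earlier in this diff. A brief sketch using the generated API shown in schema_generated.h below (the surrounding operator and model plumbing is elided):

```cpp
#include "flatbuffers/flatbuffers.h"
#include "tensorflow/contrib/lite/schema/schema_generated.h"

// Builds the options payload for a ONE_HOT operator. LOGICAL_OR carries no
// fields, so its options table is empty and takes only the builder.
flatbuffers::Offset<void> BuildOneHotOptions(
    flatbuffers::FlatBufferBuilder& builder, int32_t axis) {
  // CreateOneHotOptions is generated from the schema above; Union() erases
  // the table type so it can be stored in the BuiltinOptions union field.
  return tflite::CreateOneHotOptions(builder, axis).Union();
}
```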
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 9b84030938..4402f89b85 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -208,6 +208,12 @@ struct FakeQuantOptionsT;
struct PackOptions;
struct PackOptionsT;
+struct LogicalOrOptions;
+struct LogicalOrOptionsT;
+
+struct OneHotOptions;
+struct OneHotOptionsT;
+
struct OperatorCode;
struct OperatorCodeT;
@@ -357,11 +363,13 @@ enum BuiltinOperator {
BuiltinOperator_REDUCE_PROD = 81,
BuiltinOperator_REDUCE_MAX = 82,
BuiltinOperator_PACK = 83,
+ BuiltinOperator_LOGICAL_OR = 84,
+ BuiltinOperator_ONE_HOT = 85,
BuiltinOperator_MIN = BuiltinOperator_ADD,
- BuiltinOperator_MAX = BuiltinOperator_PACK
+ BuiltinOperator_MAX = BuiltinOperator_ONE_HOT
};
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[83] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[85] {
static BuiltinOperator values[] = {
BuiltinOperator_ADD,
BuiltinOperator_AVERAGE_POOL_2D,
@@ -445,7 +453,9 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[83] {
BuiltinOperator_FAKE_QUANT,
BuiltinOperator_REDUCE_PROD,
BuiltinOperator_REDUCE_MAX,
- BuiltinOperator_PACK
+ BuiltinOperator_PACK,
+ BuiltinOperator_LOGICAL_OR,
+ BuiltinOperator_ONE_HOT
};
return values;
}
@@ -536,6 +546,8 @@ inline const char **EnumNamesBuiltinOperator() {
"REDUCE_PROD",
"REDUCE_MAX",
"PACK",
+ "LOGICAL_OR",
+ "ONE_HOT",
nullptr
};
return names;
@@ -607,11 +619,13 @@ enum BuiltinOptions {
BuiltinOptions_ArgMinOptions = 57,
BuiltinOptions_FakeQuantOptions = 58,
BuiltinOptions_PackOptions = 59,
+ BuiltinOptions_LogicalOrOptions = 60,
+ BuiltinOptions_OneHotOptions = 61,
BuiltinOptions_MIN = BuiltinOptions_NONE,
- BuiltinOptions_MAX = BuiltinOptions_PackOptions
+ BuiltinOptions_MAX = BuiltinOptions_OneHotOptions
};
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[60] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[62] {
static BuiltinOptions values[] = {
BuiltinOptions_NONE,
BuiltinOptions_Conv2DOptions,
@@ -672,7 +686,9 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[60] {
BuiltinOptions_PowOptions,
BuiltinOptions_ArgMinOptions,
BuiltinOptions_FakeQuantOptions,
- BuiltinOptions_PackOptions
+ BuiltinOptions_PackOptions,
+ BuiltinOptions_LogicalOrOptions,
+ BuiltinOptions_OneHotOptions
};
return values;
}
@@ -739,6 +755,8 @@ inline const char **EnumNamesBuiltinOptions() {
"ArgMinOptions",
"FakeQuantOptions",
"PackOptions",
+ "LogicalOrOptions",
+ "OneHotOptions",
nullptr
};
return names;
@@ -989,6 +1007,14 @@ template<> struct BuiltinOptionsTraits<PackOptions> {
static const BuiltinOptions enum_value = BuiltinOptions_PackOptions;
};
+template<> struct BuiltinOptionsTraits<LogicalOrOptions> {
+ static const BuiltinOptions enum_value = BuiltinOptions_LogicalOrOptions;
+};
+
+template<> struct BuiltinOptionsTraits<OneHotOptions> {
+ static const BuiltinOptions enum_value = BuiltinOptions_OneHotOptions;
+};
+
struct BuiltinOptionsUnion {
BuiltinOptions type;
void *value;
@@ -1492,6 +1518,22 @@ struct BuiltinOptionsUnion {
return type == BuiltinOptions_PackOptions ?
reinterpret_cast<const PackOptionsT *>(value) : nullptr;
}
+ LogicalOrOptionsT *AsLogicalOrOptions() {
+ return type == BuiltinOptions_LogicalOrOptions ?
+ reinterpret_cast<LogicalOrOptionsT *>(value) : nullptr;
+ }
+ const LogicalOrOptionsT *AsLogicalOrOptions() const {
+ return type == BuiltinOptions_LogicalOrOptions ?
+ reinterpret_cast<const LogicalOrOptionsT *>(value) : nullptr;
+ }
+ OneHotOptionsT *AsOneHotOptions() {
+ return type == BuiltinOptions_OneHotOptions ?
+ reinterpret_cast<OneHotOptionsT *>(value) : nullptr;
+ }
+ const OneHotOptionsT *AsOneHotOptions() const {
+ return type == BuiltinOptions_OneHotOptions ?
+ reinterpret_cast<const OneHotOptionsT *>(value) : nullptr;
+ }
};
bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -5391,6 +5433,100 @@ inline flatbuffers::Offset<PackOptions> CreatePackOptions(
flatbuffers::Offset<PackOptions> CreatePackOptions(flatbuffers::FlatBufferBuilder &_fbb, const PackOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+struct LogicalOrOptionsT : public flatbuffers::NativeTable {
+ typedef LogicalOrOptions TableType;
+ LogicalOrOptionsT() {
+ }
+};
+
+struct LogicalOrOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef LogicalOrOptionsT NativeTableType;
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ verifier.EndTable();
+ }
+ LogicalOrOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+ void UnPackTo(LogicalOrOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+ static flatbuffers::Offset<LogicalOrOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LogicalOrOptionsBuilder {
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ explicit LogicalOrOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ LogicalOrOptionsBuilder &operator=(const LogicalOrOptionsBuilder &);
+ flatbuffers::Offset<LogicalOrOptions> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<LogicalOrOptions>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<LogicalOrOptions> CreateLogicalOrOptions(
+ flatbuffers::FlatBufferBuilder &_fbb) {
+ LogicalOrOptionsBuilder builder_(_fbb);
+ return builder_.Finish();
+}
+
+flatbuffers::Offset<LogicalOrOptions> CreateLogicalOrOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct OneHotOptionsT : public flatbuffers::NativeTable {
+ typedef OneHotOptions TableType;
+ int32_t axis;
+ OneHotOptionsT()
+ : axis(0) {
+ }
+};
+
+struct OneHotOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+ typedef OneHotOptionsT NativeTableType;
+ enum {
+ VT_AXIS = 4
+ };
+ int32_t axis() const {
+ return GetField<int32_t>(VT_AXIS, 0);
+ }
+ bool Verify(flatbuffers::Verifier &verifier) const {
+ return VerifyTableStart(verifier) &&
+ VerifyField<int32_t>(verifier, VT_AXIS) &&
+ verifier.EndTable();
+ }
+ OneHotOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+ void UnPackTo(OneHotOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+ static flatbuffers::Offset<OneHotOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct OneHotOptionsBuilder {
+ flatbuffers::FlatBufferBuilder &fbb_;
+ flatbuffers::uoffset_t start_;
+ void add_axis(int32_t axis) {
+ fbb_.AddElement<int32_t>(OneHotOptions::VT_AXIS, axis, 0);
+ }
+ explicit OneHotOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+ : fbb_(_fbb) {
+ start_ = fbb_.StartTable();
+ }
+ OneHotOptionsBuilder &operator=(const OneHotOptionsBuilder &);
+ flatbuffers::Offset<OneHotOptions> Finish() {
+ const auto end = fbb_.EndTable(start_);
+ auto o = flatbuffers::Offset<OneHotOptions>(end);
+ return o;
+ }
+};
+
+inline flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(
+ flatbuffers::FlatBufferBuilder &_fbb,
+ int32_t axis = 0) {
+ OneHotOptionsBuilder builder_(_fbb);
+ builder_.add_axis(axis);
+ return builder_.Finish();
+}
+
+flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
struct OperatorCodeT : public flatbuffers::NativeTable {
typedef OperatorCode TableType;
BuiltinOperator builtin_code;
@@ -5701,6 +5837,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
const PackOptions *builtin_options_as_PackOptions() const {
return builtin_options_type() == BuiltinOptions_PackOptions ? static_cast<const PackOptions *>(builtin_options()) : nullptr;
}
+ const LogicalOrOptions *builtin_options_as_LogicalOrOptions() const {
+ return builtin_options_type() == BuiltinOptions_LogicalOrOptions ? static_cast<const LogicalOrOptions *>(builtin_options()) : nullptr;
+ }
+ const OneHotOptions *builtin_options_as_OneHotOptions() const {
+ return builtin_options_type() == BuiltinOptions_OneHotOptions ? static_cast<const OneHotOptions *>(builtin_options()) : nullptr;
+ }
const flatbuffers::Vector<uint8_t> *custom_options() const {
return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
}
@@ -5968,6 +6110,14 @@ template<> inline const PackOptions *Operator::builtin_options_as<PackOptions>()
return builtin_options_as_PackOptions();
}
+template<> inline const LogicalOrOptions *Operator::builtin_options_as<LogicalOrOptions>() const {
+ return builtin_options_as_LogicalOrOptions();
+}
+
+template<> inline const OneHotOptions *Operator::builtin_options_as<OneHotOptions>() const {
+ return builtin_options_as_OneHotOptions();
+}
+
struct OperatorBuilder {
flatbuffers::FlatBufferBuilder &fbb_;
flatbuffers::uoffset_t start_;
@@ -8060,6 +8210,55 @@ inline flatbuffers::Offset<PackOptions> CreatePackOptions(flatbuffers::FlatBuffe
_axis);
}
+inline LogicalOrOptionsT *LogicalOrOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+ auto _o = new LogicalOrOptionsT();
+ UnPackTo(_o, _resolver);
+ return _o;
+}
+
+inline void LogicalOrOptions::UnPackTo(LogicalOrOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+ (void)_o;
+ (void)_resolver;
+}
+
+inline flatbuffers::Offset<LogicalOrOptions> LogicalOrOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+ return CreateLogicalOrOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LogicalOrOptions> CreateLogicalOrOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+ (void)_rehasher;
+ (void)_o;
+ struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LogicalOrOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+ return tflite::CreateLogicalOrOptions(
+ _fbb);
+}
+
+inline OneHotOptionsT *OneHotOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+ auto _o = new OneHotOptionsT();
+ UnPackTo(_o, _resolver);
+ return _o;
+}
+
+inline void OneHotOptions::UnPackTo(OneHotOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+ (void)_o;
+ (void)_resolver;
+ { auto _e = axis(); _o->axis = _e; };
+}
+
+inline flatbuffers::Offset<OneHotOptions> OneHotOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+ return CreateOneHotOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+ (void)_rehasher;
+ (void)_o;
+ struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const OneHotOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+ auto _axis = _o->axis;
+ return tflite::CreateOneHotOptions(
+ _fbb,
+ _axis);
+}
+
inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
auto _o = new OperatorCodeT();
UnPackTo(_o, _resolver);
@@ -8485,6 +8684,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
auto ptr = reinterpret_cast<const PackOptions *>(obj);
return verifier.VerifyTable(ptr);
}
+ case BuiltinOptions_LogicalOrOptions: {
+ auto ptr = reinterpret_cast<const LogicalOrOptions *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
+ case BuiltinOptions_OneHotOptions: {
+ auto ptr = reinterpret_cast<const OneHotOptions *>(obj);
+ return verifier.VerifyTable(ptr);
+ }
default: return false;
}
}
@@ -8739,6 +8946,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
auto ptr = reinterpret_cast<const PackOptions *>(obj);
return ptr->UnPack(resolver);
}
+ case BuiltinOptions_LogicalOrOptions: {
+ auto ptr = reinterpret_cast<const LogicalOrOptions *>(obj);
+ return ptr->UnPack(resolver);
+ }
+ case BuiltinOptions_OneHotOptions: {
+ auto ptr = reinterpret_cast<const OneHotOptions *>(obj);
+ return ptr->UnPack(resolver);
+ }
default: return nullptr;
}
}
@@ -8981,6 +9196,14 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
auto ptr = reinterpret_cast<const PackOptionsT *>(value);
return CreatePackOptions(_fbb, ptr, _rehasher).Union();
}
+ case BuiltinOptions_LogicalOrOptions: {
+ auto ptr = reinterpret_cast<const LogicalOrOptionsT *>(value);
+ return CreateLogicalOrOptions(_fbb, ptr, _rehasher).Union();
+ }
+ case BuiltinOptions_OneHotOptions: {
+ auto ptr = reinterpret_cast<const OneHotOptionsT *>(value);
+ return CreateOneHotOptions(_fbb, ptr, _rehasher).Union();
+ }
default: return 0;
}
}
@@ -9223,6 +9446,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
value = new PackOptionsT(*reinterpret_cast<PackOptionsT *>(u.value));
break;
}
+ case BuiltinOptions_LogicalOrOptions: {
+ value = new LogicalOrOptionsT(*reinterpret_cast<LogicalOrOptionsT *>(u.value));
+ break;
+ }
+ case BuiltinOptions_OneHotOptions: {
+ value = new OneHotOptionsT(*reinterpret_cast<OneHotOptionsT *>(u.value));
+ break;
+ }
default:
break;
}
@@ -9525,6 +9756,16 @@ inline void BuiltinOptionsUnion::Reset() {
delete ptr;
break;
}
+ case BuiltinOptions_LogicalOrOptions: {
+ auto ptr = reinterpret_cast<LogicalOrOptionsT *>(value);
+ delete ptr;
+ break;
+ }
+ case BuiltinOptions_OneHotOptions: {
+ auto ptr = reinterpret_cast<OneHotOptionsT *>(value);
+ delete ptr;
+ break;
+ }
default: break;
}
value = nullptr;
diff --git a/tensorflow/contrib/lite/simple_memory_arena.cc b/tensorflow/contrib/lite/simple_memory_arena.cc
index 4eaf6f1bfe..cd0f1f7c17 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.cc
+++ b/tensorflow/contrib/lite/simple_memory_arena.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/contrib/lite/simple_memory_arena.h"
+#include <algorithm>
#include <cstring>
#include <limits>
#include <vector>
@@ -34,7 +35,7 @@ namespace tflite {
TfLiteStatus SimpleMemoryArena::Allocate(TfLiteContext* context,
size_t alignment, size_t size,
ArenaAlloc* new_alloc) {
- TF_LITE_ENSURE(context, alignment < arena_alignment_);
+ TF_LITE_ENSURE(context, alignment <= arena_alignment_);
if (size == 0) {
new_alloc->offset = 0;
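Note: relaxing the check from `<` to `<=` is safe because alignments here are powers of two, so any request with `alignment <= arena_alignment_` divides the arena alignment; an arena base aligned to `arena_alignment_` plus a rounded-up offset therefore stays correctly aligned. The standard round-up, as a sketch:

```cpp
#include <cassert>
#include <cstddef>

// Round `offset` up to the next multiple of `alignment` (a power of two).
// If the arena base is aligned to arena_alignment and
// alignment <= arena_alignment, base + AlignTo(alignment, offset) is aligned.
size_t AlignTo(size_t alignment, size_t offset) {
  assert((alignment & (alignment - 1)) == 0);  // power of two
  return (offset + alignment - 1) & ~(alignment - 1);
}
```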
diff --git a/tensorflow/contrib/lite/testdata/add.bin b/tensorflow/contrib/lite/testdata/add.bin
new file mode 100644
index 0000000000..aef0fe3d82
--- /dev/null
+++ b/tensorflow/contrib/lite/testdata/add.bin
Binary files differ
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 4c37bcb3c9..a788d41ba7 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -112,7 +112,6 @@ cc_library(
cc_test(
name = "message_test",
srcs = ["message_test.cc"],
- tags = ["no_oss"],
deps = [
":message",
"@com_google_googletest//:gtest_main",
@@ -132,7 +131,6 @@ cc_test(
name = "split_test",
size = "small",
srcs = ["split_test.cc"],
- tags = ["no_oss"],
deps = [
":split",
"@com_google_googletest//:gtest_main",
@@ -142,13 +140,13 @@ cc_test(
cc_library(
name = "join",
hdrs = ["join.h"],
+ deps = ["//tensorflow/contrib/lite:string"],
)
cc_test(
name = "join_test",
size = "small",
srcs = ["join_test.cc"],
- tags = ["no_oss"],
deps = [
":join",
"@com_google_googletest//:gtest_main",
@@ -174,7 +172,6 @@ cc_test(
srcs = ["tflite_driver_test.cc"],
data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
tags = [
- "no_oss",
"tflite_not_portable_android",
"tflite_not_portable_ios",
],
@@ -196,7 +193,6 @@ cc_library(
cc_test(
name = "tokenize_test",
srcs = ["tokenize_test.cc"],
- tags = ["no_oss"],
deps = [
":tokenize",
"@com_google_googletest//:gtest_main",
@@ -214,12 +210,15 @@ cc_library(
cc_library(
name = "util",
hdrs = ["util.h"],
+ deps = [
+ "//tensorflow/contrib/lite:framework",
+ "//tensorflow/contrib/lite:string",
+ ],
)
cc_test(
name = "test_runner_test",
srcs = ["test_runner_test.cc"],
- tags = ["no_oss"],
deps = [
":test_runner",
"@com_google_googletest//:gtest_main",
@@ -275,6 +274,7 @@ cc_library(
":join",
":split",
":tf_driver",
+ "//tensorflow/contrib/lite:string",
"//tensorflow/core:framework",
],
)
@@ -341,7 +341,7 @@ tf_cc_test(
],
tags = [
"no_cuda_on_cpu_tap",
- "no_oss",
+ "no_oss", # needs test data
"tflite_not_portable",
],
deps = [
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 32d04c0717..6d03c0fd9e 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -104,6 +104,8 @@ KNOWN_BUGS = {
r"div.*int32": "72051395",
# No support for SplitV
r"split.*num_or_size_splits=\[2,2\]": "73377559",
+ # Scalar constants don't work.
+ r"constant.*shape=\[\]": "109811500",
}
@@ -229,6 +231,7 @@ _TF_TYPE_INFO = {
tf.int32: (np.int32, "INT32"),
tf.uint8: (np.uint8, "QUANTIZED_UINT8"),
tf.int64: (np.int64, "INT64"),
+ tf.bool: (np.bool, "BOOL"),
}
@@ -242,7 +245,10 @@ def create_tensor_data(dtype, shape, min_value=-100, max_value=100):
value = (max_value-min_value)*np.random.random_sample(shape)+min_value
elif dtype in (tf.int32, tf.uint8, tf.int64):
value = np.random.randint(min_value, max_value+1, shape)
- return value.astype(dtype)
+ elif dtype == tf.bool:
+ value = np.random.choice([True, False], size=shape)
+ return np.dtype(dtype).type(value) if np.isscalar(value) else value.astype(
+ dtype)
def create_scalar_data(dtype, min_value=-100, max_value=100):
@@ -479,7 +485,7 @@ def make_zip_of_tests(zip_path,
else report_lib.FAILED)
report["toco_log"] = toco_log
- if FLAGS.save_graphdefs:
+ if True or FLAGS.save_graphdefs:
archive.writestr(label + ".pbtxt",
text_format.MessageToString(graph_def),
zipfile.ZIP_DEFLATED)
@@ -734,21 +740,22 @@ def make_constant_tests(zip_path):
test_parameters = [{
"dtype": [tf.float32, tf.int32],
- "input_shape": [[1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
+ "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
}]
def build_graph(parameters):
- # Since Toco & Tflite can't have a single constant op in the entire graph,
- # this test adds a zero tensor with a constant op tensor.
- input1 = tf.placeholder(dtype=parameters["dtype"], name="input1",
- shape=parameters["input_shape"])
- out = tf.ones(parameters["input_shape"], dtype=parameters["dtype"]) + input1
- return [input1], [out]
+ dummy_input = tf.placeholder(
+ dtype=parameters["dtype"],
+ name="input1",
+ shape=parameters["input_shape"])
+ out = tf.constant(
+ create_tensor_data(parameters["dtype"], parameters["input_shape"]))
+ return [dummy_input], [out]
def build_inputs(parameters, sess, inputs, outputs):
- input1 = np.zeros(parameters["input_shape"],
- dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
- return [input1], sess.run(outputs, feed_dict={inputs[0]: input1})
+ dummy_input = np.zeros(
+ parameters["input_shape"], dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
+ return [dummy_input], sess.run(outputs, feed_dict={inputs[0]: dummy_input})
make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
@@ -772,6 +779,11 @@ def make_binary_op_tests(zip_path, binary_operator):
"input_shape_1": [[1, 3, 4, 3]],
"input_shape_2": [[3]],
"activation": [True]
+ }, {
+ "dtype": [tf.float32],
+ "input_shape_1": [[]],
+ "input_shape_2": [[]],
+ "activation": [False]
}]
def build_graph(parameters):
@@ -821,7 +833,7 @@ def make_reduce_tests(reduce_op):
"input_dtype": [tf.float32, tf.int32, tf.int64],
"input_shape": [[3, 2, 4]],
"axis": [
- None, 0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
+ 0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
[2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0],
[-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
],
@@ -831,13 +843,19 @@ def make_reduce_tests(reduce_op):
"input_dtype": [tf.float32],
"input_shape": [[1, 8, 8, 3]],
"axis": [
- None, 0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
+ 0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
[3, 2, 1, 0], [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2,
-3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
[2, 2, 3], [-3, -3, -4], [-3, 2, 1]
],
"const_axis": [True, False],
"keepdims": [True, False],
+ }, {
+ "input_dtype": [tf.float32],
+ "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
+ "axis": [None],
+ "const_axis": [True],
+ "keepdims": [True, False],
}]
def build_graph(parameters):
@@ -855,7 +873,7 @@ def make_reduce_tests(reduce_op):
if isinstance(parameters["axis"], list):
shape = [len(parameters["axis"])]
else:
- shape = [0] # shape for None or integers.
+ shape = [] # shape for None or integers.
axis = tf.placeholder(dtype=tf.int32, name="axis", shape=shape)
input_tensors = [input_tensor, axis]
@@ -866,10 +884,11 @@ def make_reduce_tests(reduce_op):
def build_inputs(parameters, sess, inputs, outputs):
values = [
create_tensor_data(parameters["input_dtype"],
- parameters["input_shape"])]
+ parameters["input_shape"],
+ min_value=-10,
+ max_value=10)]
if not parameters["const_axis"]:
- if parameters["axis"]:
- values.append(np.array(parameters["axis"]))
+ values.append(np.array(parameters["axis"]))
return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
@@ -902,7 +921,7 @@ def make_exp_tests(zip_path):
test_parameters = [{
"input_dtype": [tf.float32],
- "input_shape": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+ "input_shape": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
}]
def build_graph(parameters):
@@ -961,8 +980,8 @@ def make_maximum_tests(zip_path):
test_parameters = [{
"input_dtype": [tf.float32],
- "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
- "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+ "input_shape_1": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+ "input_shape_2": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
}]
def build_graph(parameters):
@@ -996,8 +1015,8 @@ def make_minimum_tests(zip_path):
test_parameters = [{
"input_dtype": [tf.float32],
- "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
- "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+ "input_shape_1": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+ "input_shape_2": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
}]
def build_graph(parameters):
@@ -1595,19 +1614,39 @@ def make_reshape_tests(zip_path):
"dtype": [tf.float32, tf.int32],
"input_shape": [[3, 4, 5, 7], [4, 105], [21, 5, 2, 2], [420]],
"output_shape": [[15, 28], [420], [1, -1, 5, 7], [-1]],
+ "constant_shape": [True, False],
+ }, {
+ "dtype": [tf.float32],
+ "input_shape": [[1]],
+ "output_shape": [[]],
+ "constant_shape": [True, False],
}]
def build_graph(parameters):
input_tensor = tf.placeholder(dtype=parameters["dtype"], name="input",
shape=parameters["input_shape"])
- out = tf.reshape(input_tensor, shape=parameters["output_shape"])
- return [input_tensor], [out]
+
+    # Get the output shape as either a placeholder or a constant.
+ if parameters["constant_shape"]:
+ output_shape = parameters["output_shape"]
+ input_tensors = [input_tensor]
+ else:
+ # The shape of the shape tensor.
+ shape_tensor_shape = [len(parameters["output_shape"])]
+ output_shape = tf.placeholder(
+ dtype=tf.int32, name="output_shape", shape=shape_tensor_shape)
+ input_tensors = [input_tensor, output_shape]
+ out = tf.reshape(input_tensor, shape=output_shape)
+ return input_tensors, [out]
def build_inputs(parameters, sess, inputs, outputs):
- input_values = create_tensor_data(parameters["dtype"],
- parameters["input_shape"])
- return [input_values], sess.run(
- outputs, feed_dict=dict(zip(inputs, [input_values])))
+ values = [
+ create_tensor_data(parameters["dtype"], parameters["input_shape"])
+ ]
+ if not parameters["constant_shape"]:
+ values.append(np.array(parameters["output_shape"]))
+
+ return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
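
The reshape hunk now exercises both a constant output shape and one fed at runtime through an int32 placeholder, plus a new [1] -> [] scalar case. A NumPy sketch, under the assumption that np.reshape matches tf.reshape's shape handling:

    import numpy as np

    x = np.array([7.0], dtype=np.float32)   # input_shape [1]
    scalar = np.reshape(x, [])              # output_shape [] -> rank 0
    assert scalar.shape == ()

    # A -1 entry infers the remaining dimension, as in [1, -1, 5, 7].
    y = np.arange(420, dtype=np.float32)
    assert np.reshape(y, [1, -1, 5, 7]).shape == (1, 12, 5, 7)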
@@ -1638,6 +1677,65 @@ def make_shape_tests(zip_path):
make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+def make_one_hot_tests(zip_path):
+ """Make a set of tests to do one_hot."""
+
+ test_parameters = [{
+ "indices_type": [tf.int32, tf.int64],
+ "indices_shape": [[3], [4, 4], [1, 5], [5, 1]],
+ "axis": [0, 1],
+ "dtype": [tf.int32, tf.int64, tf.float32],
+ "provide_optional_inputs": [True, False],
+ }]
+
+ def build_graph(parameters):
+ indices = tf.placeholder(
+ dtype=parameters["indices_type"],
+ name="indices",
+ shape=parameters["indices_shape"])
+ depth = tf.placeholder(dtype=tf.int32, name="depth", shape=())
+
+ if not parameters["provide_optional_inputs"]:
+ out = tf.one_hot(indices=indices, depth=depth)
+ return [indices, depth], [out]
+
+ on_value = tf.placeholder(
+ dtype=parameters["dtype"], name="on_value", shape=())
+ off_value = tf.placeholder(
+ dtype=parameters["dtype"], name="off_value", shape=())
+ out = tf.one_hot(
+ indices=indices,
+ depth=depth,
+ on_value=on_value,
+ off_value=off_value,
+ axis=parameters["axis"],
+ dtype=parameters["dtype"])
+ return [indices, depth, on_value, off_value], [out]
+
+ def build_inputs(parameters, sess, inputs, outputs):
+ input_values = [
+ create_tensor_data(
+ parameters["indices_type"],
+ shape=parameters["indices_shape"],
+ min_value=-1,
+ max_value=10),
+ create_tensor_data(tf.int32, shape=None, min_value=1, max_value=10),
+ ]
+
+ if parameters["provide_optional_inputs"]:
+ input_values.append(
+ create_tensor_data(
+ parameters["dtype"], shape=None, min_value=1, max_value=10))
+ input_values.append(
+ create_tensor_data(
+ parameters["dtype"], shape=None, min_value=-1, max_value=0))
+
+ return input_values, sess.run(
+ outputs, feed_dict=dict(zip(inputs, input_values)))
+
+ make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
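
Because make_one_hot_tests feeds indices down to -1 (out of range) and custom on/off values, the semantics under test are easy to misread. The one_hot helper below is a hypothetical pure-NumPy reference, not part of the patch:

    import numpy as np

    def one_hot(indices, depth, on_value=1, off_value=0, axis=-1):
        # Hypothetical reference for tf.one_hot: insert a depth-sized
        # dimension at axis; out-of-range indices stay all off_value.
        indices = np.asarray(indices)
        axis = indices.ndim if axis == -1 else axis
        out = np.full(
            indices.shape[:axis] + (depth,) + indices.shape[axis:], off_value)
        for idx in np.ndindex(indices.shape):
            j = indices[idx]
            if 0 <= j < depth:
                out[idx[:axis] + (j,) + idx[axis:]] = on_value
        return out

    print(one_hot([0, 2, -1], depth=3))
    # [[1 0 0]
    #  [0 0 1]
    #  [0 0 0]]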
def make_resize_bilinear_tests(zip_path):
"""Make a set of tests to do resize_bilinear."""
@@ -2237,7 +2335,7 @@ def make_arg_min_max_tests(zip_path):
test_parameters = [{
"input_dtype": [tf.float32, tf.int32],
- "input_shape": [[1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
+ "input_shape": [[], [1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
"output_type": [tf.int32, tf.int64],
"axis_is_last_dim": [True, False],
"is_arg_max": [True],
@@ -2273,7 +2371,8 @@ def make_equal_tests(zip_path):
test_parameters = [{
"input_dtype": [tf.float32, tf.int32, tf.int64],
- "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+ "input_shape_pair": [([], []),
+ ([1, 1, 1, 3], [1, 1, 1, 3]),
([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
([5, 5], [1]), ([10], [2, 4, 10])],
}]
@@ -2530,7 +2629,7 @@ def _make_elementwise_tests(op):
"""Actual function that generates examples."""
test_parameters = [{
"input_dtype": [tf.float32],
- "input_shape": [[1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+ "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
}]
def build_graph(parameters):
@@ -2852,6 +2951,73 @@ def make_sparse_to_dense_tests(zip_path):
make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+def make_pack_tests(zip_path):
+ """Make a set of tests to do stack."""
+
+ test_parameters = [{
+ "base_shape": [[3, 4, 3], [3, 4], [5]],
+ "num_tensors": [1, 2, 3, 4, 5, 6],
+ "axis": [0, 1, 2, 3],
+ "additional_shape": [1, 2, 3],
+ }]
+
+ def get_shape(parameters):
+ """Return a tweaked version of 'base_shape'."""
+ axis = parameters["axis"]
+ shape = parameters["base_shape"][:]
+ if axis < len(shape):
+ shape[axis] += parameters["additional_shape"]
+ return shape
+
+ def build_graph(parameters):
+ all_tensors = []
+ for n in range(0, parameters["num_tensors"]):
+ input_tensor = tf.placeholder(
+ dtype=tf.float32, name=("input%d" % n), shape=get_shape(parameters))
+ all_tensors.append(input_tensor)
+ out = tf.stack(all_tensors, parameters["axis"])
+ return all_tensors, [out]
+
+ def build_inputs(parameters, sess, inputs, outputs):
+ all_values = []
+ for _ in range(0, parameters["num_tensors"]):
+ input_values = create_tensor_data(np.float32, get_shape(parameters))
+ all_values.append(input_values)
+ return all_values, sess.run(
+ outputs, feed_dict=dict(zip(inputs, all_values)))
+
+ make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
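
For orientation, tf.stack (serialized as the Pack builtin elsewhere in this change) inserts a new dimension of size num_tensors at the given axis. A quick NumPy sketch of that shape rule, not part of the patch:

    import numpy as np

    tensors = [np.zeros((3, 4), dtype=np.float32) for _ in range(5)]
    assert np.stack(tensors, axis=0).shape == (5, 3, 4)
    assert np.stack(tensors, axis=1).shape == (3, 5, 4)
    assert np.stack(tensors, axis=2).shape == (3, 4, 5)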
+def make_logical_or_tests(zip_path):
+ """Make a set of tests to do logical_or."""
+
+ test_parameters = [{
+ "input_shape_pair": [([], []), ([1, 1, 1, 3], [1, 1, 1, 3]),
+ ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
+ ([5, 5], [1]), ([10], [2, 4, 10])],
+ }]
+
+ def build_graph(parameters):
+ """Build the logical_or op testing graph."""
+ input_value1 = tf.placeholder(
+ dtype=tf.bool, name="input1", shape=parameters["input_shape_pair"][0])
+ input_value2 = tf.placeholder(
+ dtype=tf.bool, name="input2", shape=parameters["input_shape_pair"][1])
+ out = tf.logical_or(input_value1, input_value2)
+ return [input_value1, input_value2], [out]
+
+ def build_inputs(parameters, sess, inputs, outputs):
+ input_value1 = create_tensor_data(tf.bool,
+ parameters["input_shape_pair"][0])
+ input_value2 = create_tensor_data(tf.bool,
+ parameters["input_shape_pair"][1])
+ return [input_value1, input_value2], sess.run(
+ outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+ make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
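
The shape pairs above deliberately require broadcasting, including the new scalar-vs-scalar case. A short NumPy sketch, assuming np.logical_or broadcasting matches the TF op:

    import numpy as np

    a = np.random.rand(5, 5) > 0.5   # shape (5, 5)
    b = np.random.rand(1) > 0.5      # shape (1,)
    assert np.logical_or(a, b).shape == (5, 5)     # broadcasts
    assert np.logical_or(True, False).shape == ()  # the ([], []) pair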
# Toco binary path provided by the generate rule.
bin_path = None
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/contrib/lite/testing/generate_testspec.cc
index c1092e4d25..f29c188e6c 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.cc
+++ b/tensorflow/contrib/lite/testing/generate_testspec.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include <iostream>
+
#include "tensorflow/contrib/lite/testing/generate_testspec.h"
#include "tensorflow/contrib/lite/testing/join.h"
#include "tensorflow/contrib/lite/testing/split.h"
@@ -88,13 +90,13 @@ bool GenerateTestSpecFromTensorflowModel(
TfDriver runner(input_layer, input_layer_type, input_layer_shape,
output_layer);
if (!runner.IsValid()) {
- cerr << runner.GetErrorMessage() << endl;
+ std::cerr << runner.GetErrorMessage() << std::endl;
return false;
}
runner.LoadModel(tensorflow_model_path);
if (!runner.IsValid()) {
- cerr << runner.GetErrorMessage() << endl;
+ std::cerr << runner.GetErrorMessage() << std::endl;
return false;
}
@@ -118,14 +120,14 @@ bool GenerateTestSpecFromTensorflowModel(
for (int j = 0; j < input_values.size(); j++) {
runner.SetInput(j, input_values[j]);
if (!runner.IsValid()) {
- cerr << runner.GetErrorMessage() << endl;
+ std::cerr << runner.GetErrorMessage() << std::endl;
return false;
}
}
runner.Invoke();
if (!runner.IsValid()) {
- cerr << runner.GetErrorMessage() << endl;
+ std::cerr << runner.GetErrorMessage() << std::endl;
return false;
}
@@ -137,7 +139,7 @@ bool GenerateTestSpecFromTensorflowModel(
for (int j = 0; j < output_layer.size(); j++) {
stream << " output: \"" << runner.ReadOutput(j) << "\"\n";
if (!runner.IsValid()) {
- cerr << runner.GetErrorMessage() << endl;
+ std::cerr << runner.GetErrorMessage() << std::endl;
return false;
}
}
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.h b/tensorflow/contrib/lite/testing/generate_testspec.h
index bfaf5e7ec8..b3d0db31c0 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.h
+++ b/tensorflow/contrib/lite/testing/generate_testspec.h
@@ -19,6 +19,8 @@ limitations under the License.
#include <iostream>
#include <vector>
+#include "tensorflow/contrib/lite/string.h"
+
namespace tflite {
namespace testing {
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 770092e12c..106cbc1b8e 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -226,7 +226,8 @@ TEST_P(OpsTest, RunZipTests) {
string message = test_driver.GetErrorMessage();
if (bug_number.empty()) {
if (FLAGS_use_nnapi && FLAGS_ignore_unsupported_nnapi && !result) {
- EXPECT_EQ(message, string("Failed to invoke interpreter")) << message;
+ EXPECT_EQ(message, string("Failed to invoke NNAPI interpreter"))
+ << message;
} else {
EXPECT_TRUE(result) << message;
}
diff --git a/tensorflow/contrib/lite/testing/join.h b/tensorflow/contrib/lite/testing/join.h
index 1edee01cf9..4be19ad756 100644
--- a/tensorflow/contrib/lite/testing/join.h
+++ b/tensorflow/contrib/lite/testing/join.h
@@ -17,7 +17,8 @@ limitations under the License.
#include <cstdlib>
#include <sstream>
-#include <string>
+
+#include "tensorflow/contrib/lite/string.h"
namespace tflite {
namespace testing {
diff --git a/tensorflow/contrib/lite/testing/test_runner.h b/tensorflow/contrib/lite/testing/test_runner.h
index 96ab6be54e..fac7d01aab 100644
--- a/tensorflow/contrib/lite/testing/test_runner.h
+++ b/tensorflow/contrib/lite/testing/test_runner.h
@@ -90,7 +90,7 @@ class TestRunner {
// Invalidate the test runner, preventing it from executing any further.
void Invalidate(const string& error_message) {
- cerr << error_message << std::endl;
+ std::cerr << error_message << std::endl;
error_message_ = error_message;
}
bool IsValid() const { return error_message_.empty(); }
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
index 3b27f6f3da..ec435ca60d 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -28,8 +28,8 @@ namespace {
tensorflow::Tensor CreateTensor(const tensorflow::DataType type,
const std::vector<int64_t>& dim) {
- tensorflow::TensorShape shape{gtl::ArraySlice<int64>{
- reinterpret_cast<const int64*>(dim.data()), dim.size()}};
+ tensorflow::TensorShape shape{tensorflow::gtl::ArraySlice<tensorflow::int64>{
+ reinterpret_cast<const tensorflow::int64*>(dim.data()), dim.size()}};
return {type, shape};
}
@@ -179,7 +179,7 @@ void TfDriver::Invoke() {
auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
output_names_, {}, &output_tensors_);
if (!status.ok()) {
- Invalidate("Failed to invoke interpreter");
+ Invalidate("Failed to run input data on graph");
}
}
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
index 7a57e8d3fb..695c2a3de6 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_flags.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
@@ -15,6 +15,8 @@ limitations under the License.
#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
#define TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
+#include <cstring>
+
#include "tensorflow/contrib/lite/testing/split.h"
#include "tensorflow/contrib/lite/testing/tflite_diff_util.h"
#include "tensorflow/core/util/command_line_flags.h"
diff --git a/tensorflow/contrib/lite/testing/util.h b/tensorflow/contrib/lite/testing/util.h
index 6d20aec141..8aa639157b 100644
--- a/tensorflow/contrib/lite/testing/util.h
+++ b/tensorflow/contrib/lite/testing/util.h
@@ -15,8 +15,39 @@ limitations under the License.
#ifndef TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
#define TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+#include <cstdio>
+
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/string.h"
+
namespace tflite {
+// An ErrorReporter that collects error messages in a string, in addition
+// to printing them to stderr.
+class TestErrorReporter : public ErrorReporter {
+ public:
+ int Report(const char* format, va_list args) override {
+ char buffer[1024];
+ int size = vsnprintf(buffer, sizeof(buffer), format, args);
+ fprintf(stderr, "%s", buffer);
+ error_messages_ += buffer;
+ num_calls_++;
+ return size;
+ }
+
+ void Reset() {
+ num_calls_ = 0;
+ error_messages_.clear();
+ }
+
+ int num_calls() const { return num_calls_; }
+ const string& error_messages() const { return error_messages_; }
+
+ private:
+ int num_calls_ = 0;
+ string error_messages_;
+};
+
inline void LogToStderr() {
#ifdef PLATFORM_GOOGLE
FLAGS_logtostderr = true;
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index b79bb300f0..378212cb74 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -1316,6 +1316,20 @@ void ConvertResizeBilinearOperator(const Model& model,
(*resize_op->mutable_attr())["align_corners"].set_b(src_op.align_corners);
}
+void ConvertOneHotOperator(const Model& model, const OneHotOperator& src_op,
+ GraphDef* tensorflow_graph) {
+ tensorflow::NodeDef* onehot_op = tensorflow_graph->add_node();
+ onehot_op->set_op("OneHot");
+ onehot_op->set_name(src_op.outputs[0]);
+ CHECK_EQ(src_op.inputs.size(), 4);
+ for (const auto& input : src_op.inputs) {
+ *onehot_op->add_input() = input;
+ }
+ (*onehot_op->mutable_attr())["T"].set_type(
+ GetTensorFlowDataType(model, src_op.outputs[0]));
+ (*onehot_op->mutable_attr())["axis"].set_i(src_op.axis);
+}
+
namespace {
// TODO(aselle): Remove when available in absl
absl::string_view FindLongestCommonPrefix(absl::string_view a,
@@ -1911,6 +1925,21 @@ void ConvertLogicalNotOperator(const Model& model,
*logical_op->add_input() = src_op.inputs[0];
}
+void ConvertLogicalOrOperator(const Model& model,
+ const LogicalOrOperator& src_op,
+ const char* op_name, GraphDef* tensorflow_graph) {
+ tensorflow::NodeDef* logical_or_op = tensorflow_graph->add_node();
+ logical_or_op->set_op(op_name);
+ logical_or_op->set_name(src_op.outputs[0]);
+ CHECK_EQ(src_op.inputs.size(), 2);
+ for (int i = 0; i < 2; ++i) {
+ *logical_or_op->add_input() = src_op.inputs[i];
+ }
+ const tensorflow::DataType data_type =
+ GetTensorFlowDataType(model, src_op.inputs[0]);
+ (*logical_or_op->mutable_attr())["T"].set_type(data_type);
+}
+
void ConvertOperator(const Model& model, const Operator& src_op,
GraphDef* tensorflow_graph) {
if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -2158,6 +2187,13 @@ void ConvertOperator(const Model& model, const Operator& src_op,
ConvertLogicalNotOperator(model,
static_cast<const LogicalNotOperator&>(src_op),
tensorflow_graph);
+ } else if (src_op.type == OperatorType::kOneHot) {
+ ConvertOneHotOperator(model, static_cast<const OneHotOperator&>(src_op),
+ tensorflow_graph);
+ } else if (src_op.type == OperatorType::kLogicalOr) {
+ ConvertLogicalOrOperator(model,
+ static_cast<const LogicalOrOperator&>(src_op),
+ "LogicalOr", tensorflow_graph);
} else {
LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
}
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index 18b7848db8..4bf47aa3c4 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -36,7 +36,7 @@ There are two approaches to running TOCO via command line.
* `tflite_convert`: Starting from TensorFlow 1.9, the command-line tool
`tflite_convert` will be installed as part of the Python package. All of the
examples below use `tflite_convert` for simplicity.
- * Example: `tflite --output_file=...`
+ * Example: `tflite_convert --output_file=...`
* `bazel`: In order to run the latest version of TOCO, [clone the TensorFlow
repository](https://www.tensorflow.org/install/install_sources#clone_the_tensorflow_repository)
and use `bazel`. This is the recommended approach for converting models that
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
index 75642bbc37..c13fc0de75 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
@@ -181,7 +181,7 @@ bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
// future without worrying.
static constexpr int kMinDistanceBetweenBadValues = 16;
if (distance < kMinDistanceBetweenBadValues) {
- if (allow_nudging_weights()) {
+ if (allow_nudging_weights() || has_default_ranges_flag()) {
buffer_data[i] = 1;
changed = true;
continue;
@@ -200,6 +200,15 @@ bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
}
if (changed) {
+ if (has_default_ranges_flag()) {
+ std::cerr
+ << "Since the specified values of --default_ranges_min and "
+ "--default_ranges_max result in values incompatible with TFLite's "
+ "fast int8 kernels, "
+ "--allow_nudging_weights_to_use_fast_gemm_kernel "
+ "has been enabled. This may affect the accuracy of the model."
+ << std::endl;
+ }
AddMessageF("Tweaked weights values for %s", LogName(op));
}
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index b7634e28c6..8d9a4c4700 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -262,8 +262,12 @@ class EnsureUint8WeightsSafeForFastInt8Kernels : public GraphTransformation {
bool allow_nudging_weights() const { return allow_nudging_weights_; }
void set_allow_nudging_weights(bool val) { allow_nudging_weights_ = val; }
+ bool has_default_ranges_flag() const { return has_default_ranges_flag_; }
+ void set_has_default_ranges_flag(bool val) { has_default_ranges_flag_ = val; }
+
private:
bool allow_nudging_weights_ = false;
+ bool has_default_ranges_flag_ = false;
};
#undef DECLARE_GRAPH_TRANSFORMATION
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 3dda536ef7..9cec6d65f3 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -65,6 +65,7 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
case OperatorType::kAny:
case OperatorType::kLogicalAnd:
case OperatorType::kLogicalNot:
+ case OperatorType::kLogicalOr:
// These operators unconditionally produce bool outputs
SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
break;
@@ -154,8 +155,8 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
return false;
}
for (int i = 0; i < op->outputs.size(); ++i) {
- auto output = op->outputs[i];
- auto data_type = unsupported_op->output_data_types[i];
+ const string& output = op->outputs[i];
+ const ArrayDataType data_type = unsupported_op->output_data_types[i];
model->GetArray(output).data_type = data_type;
}
break;
@@ -193,6 +194,26 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
SetDataTypeForAllOutputs(model, op, data_type);
break;
}
+ case OperatorType::kPack: {
+ const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+ for (const auto& input : op->inputs) {
+ CHECK(data_type == model->GetArray(input).data_type);
+ }
+ SetDataTypeForAllOutputs(model, op, data_type);
+ break;
+ }
+ case OperatorType::kOneHot: {
+ CHECK_EQ(op->inputs.size(), 4);
+ CHECK_EQ(op->outputs.size(), 1);
+ const ArrayDataType on_value_type =
+ model->GetArray(op->inputs[OneHotOperator::ON_VALUE_INPUT]).data_type;
+ const ArrayDataType off_value_type =
+ model->GetArray(op->inputs[OneHotOperator::OFF_VALUE_INPUT])
+ .data_type;
+ CHECK(on_value_type == off_value_type);
+ model->GetArray(op->outputs[0]).data_type = on_value_type;
+ break;
+ }
default: {
// These operators produce outputs with the same type as their 1st input
CHECK_GT(op->inputs.size(), 0);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 62ed5c46e9..3c9379fd87 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1578,6 +1578,61 @@ void ProcessAnyOperator(Model* model, AnyOperator* op) {
}
}
+void ProcessOneHotOperator(Model* model, OneHotOperator* op) {
+ CHECK_EQ(op->inputs.size(), 4);
+ CHECK_EQ(op->outputs.size(), 1);
+ auto& output_array = model->GetArray(op->outputs[0]);
+ if (output_array.has_shape()) {
+ // Shape already propagated
+ return;
+ }
+
+ // Yield until indices dims have been resolved.
+ const auto& indices_array =
+ model->GetArray(op->inputs[OneHotOperator::INDICES_INPUT]);
+ if (!indices_array.has_shape()) {
+ return;
+ }
+
+ // Yield until depth is constant and dims have been resolved.
+ if (!IsConstantParameterArray(*model,
+ op->inputs[OneHotOperator::DEPTH_INPUT])) {
+ return;
+ }
+ const auto& depth_array =
+ model->GetArray(op->inputs[OneHotOperator::DEPTH_INPUT]);
+ if (!depth_array.has_shape()) {
+ return;
+ }
+
+ CHECK(depth_array.data_type == ArrayDataType::kInt32)
+ << "Depth array must be int32.";
+ CHECK_EQ(RequiredBufferSizeForShape(depth_array.shape()), 1)
+ << "Depth array must be scalar.";
+
+ const int depth = depth_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+ CHECK_GE(depth, 0) << "Depth must be non-negative.";
+
+ const int indices_dims = indices_array.shape().dimensions_count();
+ const int output_dims = indices_dims + 1;
+ const int axis = op->axis == -1 ? indices_dims : op->axis;
+ CHECK_GE(axis, 0) << "Resolved axis must be non-negative.";
+
+ auto* mutable_dims = output_array.mutable_shape()->mutable_dims();
+ mutable_dims->resize(output_dims);
+ for (int i = 0; i < output_dims; ++i) {
+ int dim = 0;
+ if (i < axis) {
+ dim = indices_array.shape().dims(i);
+ } else if (i == axis) {
+ dim = depth;
+ } else {
+ dim = indices_array.shape().dims(i - 1);
+ }
+ (*mutable_dims)[i] = dim;
+ }
+}
+
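
The dimension loop in ProcessOneHotOperator can be restated compactly. The helper below is a hypothetical Python sketch of the same rule (output rank is indices rank plus one, with depth spliced in at the resolved axis; axis == -1 means append last):

    def one_hot_output_shape(indices_shape, depth, axis):
        # Mirrors the C++ loop above: dims before axis come from the
        # indices, axis itself is depth, later dims shift right by one.
        axis = len(indices_shape) if axis == -1 else axis
        return indices_shape[:axis] + [depth] + indices_shape[axis:]

    assert one_hot_output_shape([4, 4], depth=3, axis=-1) == [4, 4, 3]
    assert one_hot_output_shape([4, 4], depth=3, axis=0) == [3, 4, 4]
    assert one_hot_output_shape([4, 4], depth=3, axis=1) == [4, 3, 4]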
} // namespace
bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
@@ -1618,6 +1673,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
case OperatorType::kSin:
case OperatorType::kLogicalAnd:
case OperatorType::kLogicalNot:
+ case OperatorType::kLogicalOr:
ProcessSimpleOperator(model, op, 0);
break;
case OperatorType::kGather:
@@ -1786,8 +1842,19 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
ProcessArgMinMaxOperator<ArgMinOperator>(
model, static_cast<ArgMinOperator*>(op));
break;
- case OperatorType::kUnsupported:
+ case OperatorType::kUnsupported: {
+ const auto* unsupported_op =
+ static_cast<TensorFlowUnsupportedOperator*>(op);
+      // The attribute may be left unspecified; if so, ignore it.
+ if (unsupported_op->output_shapes.size() < op->outputs.size()) {
+ return false;
+ }
+ for (int i = 0; i < op->outputs.size(); ++i) {
+ const string& output = op->outputs[i];
+ model->GetArray(output).copy_shape(unsupported_op->output_shapes.at(i));
+ }
break;
+ }
case OperatorType::kSvdf:
ProcessSvdfOperator(model, static_cast<SvdfOperator*>(op));
break;
@@ -1814,6 +1881,9 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
case OperatorType::kAny:
ProcessAnyOperator(model, static_cast<AnyOperator*>(op));
break;
+ case OperatorType::kOneHot:
+ ProcessOneHotOperator(model, static_cast<OneHotOperator*>(op));
+ break;
default:
// Unimplemented, another graph transformation should drop it.
LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 5be2757479..f6ce3b3ecb 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -50,6 +50,7 @@ bool SupportsQuantization(const Operator& op) {
type == OperatorType::kSqueeze || type == OperatorType::kPad ||
type == OperatorType::kPadV2 || type == OperatorType::kReshape ||
type == OperatorType::kTanh || type == OperatorType::kMul ||
+ type == OperatorType::kBatchToSpaceND ||
type == OperatorType::kSpaceToBatchND ||
type == OperatorType::kSpaceToDepth ||
type == OperatorType::kStridedSlice ||
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 8bb797fe0f..9a3db5c888 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -215,7 +215,7 @@ tensorflow::Status ImportFloatArray(const TensorProto& input_tensor,
Array* output_array) {
CHECK_EQ(input_tensor.dtype(), DT_FLOAT);
const auto& input_shape = input_tensor.tensor_shape();
- CHECK_LE(input_shape.dim_size(), 4);
+ CHECK_LE(input_shape.dim_size(), 6);
int input_flat_size;
auto status = ImportShape(input_shape.dim(), &input_flat_size,
output_array->mutable_shape());
@@ -253,7 +253,7 @@ tensorflow::Status ImportQuint8Array(const TensorProto& input_tensor,
Array* output_array) {
CHECK_EQ(input_tensor.dtype(), DT_QUINT8);
const auto& input_shape = input_tensor.tensor_shape();
- CHECK_LE(input_shape.dim_size(), 4);
+ CHECK_LE(input_shape.dim_size(), 6);
int input_flat_size;
auto status = ImportShape(input_shape.dim(), &input_flat_size,
output_array->mutable_shape());
@@ -290,7 +290,7 @@ tensorflow::Status ImportInt32Array(const TensorProto& input_tensor,
Array* output_array) {
CHECK_EQ(input_tensor.dtype(), DT_INT32);
const auto& input_shape = input_tensor.tensor_shape();
- CHECK_LE(input_shape.dim_size(), 4);
+ CHECK_LE(input_shape.dim_size(), 6);
int input_flat_size;
auto status = ImportShape(input_shape.dim(), &input_flat_size,
output_array->mutable_shape());
@@ -326,7 +326,7 @@ tensorflow::Status ImportInt64Array(const TensorProto& input_tensor,
Array* output_array) {
CHECK_EQ(input_tensor.dtype(), DT_INT64);
const auto& input_shape = input_tensor.tensor_shape();
- CHECK_LE(input_shape.dim_size(), 4);
+ CHECK_LE(input_shape.dim_size(), 6);
int input_flat_size;
auto status = ImportShape(input_shape.dim(), &input_flat_size,
output_array->mutable_shape());
@@ -363,7 +363,7 @@ tensorflow::Status ImportBoolArray(const TensorProto& input_tensor,
Array* output_array) {
CHECK_EQ(input_tensor.dtype(), DT_BOOL);
const auto& input_shape = input_tensor.tensor_shape();
- CHECK_LE(input_shape.dim_size(), 4);
+ CHECK_LE(input_shape.dim_size(), 6);
int input_flat_size;
auto status = ImportShape(input_shape.dim(), &input_flat_size,
output_array->mutable_shape());
@@ -409,7 +409,7 @@ tensorflow::Status ImportStringArray(const TensorProto& input_tensor,
Array* output_array) {
CHECK_EQ(input_tensor.dtype(), DT_STRING);
const auto& input_shape = input_tensor.tensor_shape();
- CHECK_LE(input_shape.dim_size(), 4);
+ CHECK_LE(input_shape.dim_size(), 6);
int input_flat_size;
auto status = ImportShape(input_shape.dim(), &input_flat_size,
output_array->mutable_shape());
@@ -1045,6 +1045,11 @@ tensorflow::Status ConvertSimpleOperator(
tensorflow::Status ConvertUnsupportedOperator(
const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
Model* model) {
+  // Names of special attributes in the TF graph that are used by Toco.
+ static constexpr char kAttrOutputQuantized[] = "_output_quantized";
+ static constexpr char kAttrOutputTypes[] = "_output_types";
+ static constexpr char kAttrOutputShapes[] = "_output_shapes";
+
LOG(INFO) << "Converting unsupported operation: " << node.op();
auto* op = new TensorFlowUnsupportedOperator;
const int num_inputs = GetInputsCount(node, tf_import_flags);
@@ -1055,11 +1060,11 @@ tensorflow::Status ConvertUnsupportedOperator(
op->tensorflow_op = node.op();
node.SerializeToString(&op->tensorflow_node_def);
model->operators.emplace_back(op);
- if (HasAttr(node, "_output_quantized")) {
- op->quantized = GetBoolAttr(node, "_output_quantized");
+ if (HasAttr(node, kAttrOutputQuantized)) {
+ op->quantized = GetBoolAttr(node, kAttrOutputQuantized);
}
- if (HasAttr(node, "_output_types")) {
- const auto& output_types = GetListAttr(node, "_output_types");
+ if (HasAttr(node, kAttrOutputTypes)) {
+ const auto& output_types = GetListAttr(node, kAttrOutputTypes);
for (int i = 0; i < output_types.type_size(); ++i) {
op->output_data_types.push_back(ConvertDataType(output_types.type(i)));
}
@@ -1067,6 +1072,19 @@ tensorflow::Status ConvertUnsupportedOperator(
const auto& output_type = GetDataTypeAttr(node, "Tout");
op->output_data_types.push_back(ConvertDataType(output_type));
}
+ if (HasAttr(node, kAttrOutputShapes)) {
+ const auto& output_shapes = GetListAttr(node, kAttrOutputShapes);
+ Shape output_shape;
+ for (int i = 0; i < output_shapes.shape_size(); ++i) {
+ const auto status =
+ ImportShape(output_shapes.shape(i).dim(), /*input_flat_size=*/nullptr,
+ &output_shape);
+ if (!status.ok()) {
+ return status;
+ }
+ op->output_shapes.push_back(output_shape);
+ }
+ }
return tensorflow::Status::OK();
}
@@ -1529,6 +1547,7 @@ tensorflow::Status ConvertPackOperator(
for (int i = 0; i < num_inputs; ++i) {
op->inputs.push_back(node.input(i));
}
+ op->values_count = HasAttr(node, "N") ? GetIntAttr(node, "N") : num_inputs;
op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : 0;
op->dtype = ConvertDataType(toco::GetDataTypeAttr(node, "T"));
op->outputs.push_back(node.name());
@@ -1814,6 +1833,27 @@ tensorflow::Status ConvertSparseToDenseOperator(
return tensorflow::Status::OK();
}
+tensorflow::Status ConvertOneHotOperator(
+ const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+ Model* model) {
+ CHECK_EQ(node.op(), "OneHot");
+ TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 4));
+
+ const auto dtype = GetDataTypeAttr(node, "T");
+ // TODO(b/111744875): Support DT_UINT8 and quantization.
+ CHECK(dtype == DT_INT32 || dtype == DT_INT64 || dtype == DT_FLOAT ||
+ dtype == DT_BOOL);
+
+ auto op = absl::make_unique<OneHotOperator>();
+ op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : -1;
+ for (const string& input : node.input()) {
+ op->inputs.push_back(input);
+ }
+ op->outputs.push_back(node.name());
+ model->operators.emplace_back(op.release());
+ return tensorflow::Status::OK();
+}
+
} // namespace
namespace internal {
@@ -1874,9 +1914,10 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
{"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2>},
{"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2>},
{"Log", ConvertSimpleOperator<LogOperator, 1>},
- {"LogSoftmax", ConvertSimpleOperator<LogSoftmaxOperator, 1>},
{"LogicalAnd", ConvertSimpleOperator<LogicalAndOperator, 2>},
+ {"LogicalOr", ConvertSimpleOperator<LogicalOrOperator, 2>},
{"LogicalNot", ConvertSimpleOperator<LogicalNotOperator, 1>},
+ {"LogSoftmax", ConvertSimpleOperator<LogSoftmaxOperator, 1>},
{"MatMul", ConvertMatMulOperator},
{"Max", ConvertReduceOperator<TensorFlowMaxOperator>},
{"MaxPool", ConvertMaxPoolOperator},
@@ -1890,6 +1931,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
{"NextIteration", ConvertOperatorSpecialCasedAsRNNBackEdge},
{"NoOp", ConvertNoOpOperator},
{"NotEqual", ConvertSimpleOperator<TensorFlowNotEqualOperator, 2>},
+ {"OneHot", ConvertOneHotOperator},
{"Pack", ConvertPackOperator},
{"Pad", ConvertSimpleOperator<PadOperator, 2>},
{"PadV2", ConvertSimpleOperator<PadV2Operator, 3>},
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 6fe194516d..7d0dbfcc05 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -64,6 +64,7 @@ enum class OperatorType : uint8 {
kMaxPool,
kFakeQuant,
kMul,
+ kOneHot,
kRandomUniform,
kRange,
kRank,
@@ -146,6 +147,7 @@ enum class OperatorType : uint8 {
kAny,
kLogicalAnd,
kLogicalNot,
+ kLogicalOr,
};
// Helper to deal with TensorFlow arrays using a different ordering of
@@ -292,6 +294,46 @@ struct Buffer : GenericBuffer {
std::vector<DataType<A>> data;
};
+class Shape {
+ public:
+ // For Shape, we stick to half-way encapsulation for now:
+ // we hide the raw dims_ member, but expose it raw by accessors
+ // because from some brainstorming, it's not at all easy to
+ // anticipate which flavor of more hermetic encapsulation would
+ // actually buy us future-proof-ness without being needlessly
+ // cumbersome.
+ Shape() {}
+ Shape(std::initializer_list<int> dim_list) : dims_(dim_list) {}
+
+ void ReplaceDims(std::initializer_list<int> dim_list) {
+ dims_ = std::vector<int>(dim_list);
+ }
+
+ const std::vector<int>& dims() const { return dims_; }
+ std::vector<int>* mutable_dims() { return &dims_; }
+ const int dimensions_count() const { return dims_.size(); }
+
+ // We still have that one convenience accessor to avoid
+ // the awkward double bracket issue: shape.dims()[i].
+ int dims(int i) const {
+ // Always check for out-of-bounds accesses, even in optimized builds where
+ // standard assertions are disabled. Out-of-bounds access here is a common
+ // occurrence.
+ CHECK_GE(i, 0);
+ CHECK_GT(dims_.size(), i);
+ return dims_[i];
+ }
+
+ bool operator==(const Shape& comp) const {
+ return (this->dims_ == comp.dims());
+ }
+
+ bool operator!=(const Shape& comp) const { return !((*this) == comp); }
+
+ private:
+ std::vector<int> dims_;
+};
+
// Base class for all operator classes.
struct Operator {
// Non-default-constructible: only OperatorType-specific subclass
@@ -1164,6 +1206,7 @@ struct TensorFlowRsqrtOperator : Operator {
// TensorFlow equivalent: Pack
struct PackOperator : Operator {
PackOperator() : Operator(OperatorType::kPack) {}
+ int values_count;
int axis = 0;
ArrayDataType dtype = ArrayDataType::kNone;
};
@@ -1468,6 +1511,8 @@ struct TensorFlowUnsupportedOperator : Operator {
bool quantized = false;
// Output data types
std::vector<ArrayDataType> output_data_types;
+ // Output shapes.
+ std::vector<Shape> output_shapes;
};
// Softmax activation function.
@@ -1725,6 +1770,38 @@ struct LogicalNotOperator : Operator {
LogicalNotOperator() : Operator(OperatorType::kLogicalNot) {}
};
+// OneHot operator:
+//
+// Inputs:
+// Inputs[0]: required: indices.
+// Inputs[1]: required: depth.
+// Inputs[2]: required: on_value.
+// Inputs[3]: required: off_value.
+//
+// TensorFlow equivalent: OneHot.
+struct OneHotOperator : Operator {
+ enum Inputs {
+ INDICES_INPUT = 0,
+ DEPTH_INPUT = 1,
+ ON_VALUE_INPUT = 2,
+ OFF_VALUE_INPUT = 3,
+ };
+
+ OneHotOperator() : Operator(OperatorType::kOneHot) {}
+ int axis = -1;
+};
+
+// LogicalOr operator:
+//
+// Inputs:
+// Inputs[0]: required: A Bool tensor.
+// Inputs[1]: required: A Bool tensor.
+//
+// TensorFlow equivalent: LogicalOr.
+struct LogicalOrOperator : Operator {
+ LogicalOrOperator() : Operator(OperatorType::kLogicalOr) {}
+};
+
// Alloc's are used for transient arrays only. An Alloc specifies which interval
// of the "transient_data" workspace buffer passed to inference functions, is to
// be used for the transient array at hand. The 'start' and 'end' values are
@@ -1738,46 +1815,6 @@ inline bool operator<(const Alloc& a, const Alloc& b) {
return a.start < b.start;
}
-class Shape {
- public:
- // For Shape, we stick to half-way encapsulation for now:
- // we hide the raw dims_ member, but expose it raw by accessors
- // because from some brainstorming, it's not at all easy to
- // anticipate which flavor of more hermetic encapsulation would
- // actually buy us future-proof-ness without being needlessly
- // cumbersome.
- Shape() {}
- Shape(std::initializer_list<int> dim_list) : dims_(dim_list) {}
-
- void ReplaceDims(std::initializer_list<int> dim_list) {
- dims_ = std::vector<int>(dim_list);
- }
-
- const std::vector<int>& dims() const { return dims_; }
- std::vector<int>* mutable_dims() { return &dims_; }
- const int dimensions_count() const { return dims_.size(); }
-
- // We still have that one convenience accessor to avoid
- // the awkward double bracket issue: shape.dims()[i].
- int dims(int i) const {
- // Always check for out-of-bounds accesses, even in optimized builds where
- // standard assertions are disabled. Out-of-bounds access here is a common
- // occurrence.
- CHECK_GE(i, 0);
- CHECK_GT(dims_.size(), i);
- return dims_[i];
- }
-
- bool operator==(const Shape& comp) const {
- return (this->dims_ == comp.dims());
- }
-
- bool operator!=(const Shape& comp) const { return !((*this) == comp); }
-
- private:
- std::vector<int> dims_;
-};
-
// Array represents an array (either a constant parameter array or an
// activations array) in a Model.
struct Array {
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 06072d1fcb..d34da63e43 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -322,6 +322,10 @@ void ReadModelFlagsFromCommandLineFlags(
for (int i = 0; i < input_shapes.size(); ++i) {
auto* shape = model_flags->mutable_input_arrays(i)->mutable_shape();
shape->clear_dims();
+ // Treat an empty input shape as a scalar.
+ if (input_shapes[i].empty()) {
+ continue;
+ }
for (const auto& dim_str : absl::StrSplit(input_shapes[i], ',')) {
int size;
CHECK(absl::SimpleAtoi(dim_str, &size))
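
The flag change above makes an empty --input_shapes entry mean a rank-0 (scalar) input rather than a parse error. A hypothetical Python sketch of that parsing rule:

    def parse_input_shape(shape_str):
        # Empty string -> scalar (rank 0); otherwise comma-separated dims.
        if not shape_str:
            return []
        return [int(d) for d in shape_str.split(",")]

    assert parse_input_shape("1,8,8,3") == [1, 8, 8, 3]
    assert parse_input_shape("") == []   # scalar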
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
index d1fdbcb8e9..a95937ba0f 100644
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -262,7 +262,7 @@ TEST_F(VersionedOpExportTest, Export) {
EXPECT_EQ(1, (*operators)[1]->opcode_index());
}
-// TODO(ahentz): tests for tensors, inputs, outpus, opcodes and operators.
+// TODO(ahentz): tests for tensors, inputs, outputs, opcodes and operators.
} // namespace
} // namespace tflite
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 1a1c4b8944..9380168f30 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -1013,6 +1013,26 @@ class ExpandDims
int GetVersion(const Operator& op) const override { return 1; }
};
+class Pack : public BuiltinOperator<PackOperator, ::tflite::PackOptions,
+ ::tflite::BuiltinOptions_PackOptions> {
+ public:
+ using BuiltinOperator::BuiltinOperator;
+
+ flatbuffers::Offset<TfLiteOptions> WriteOptions(
+ const TocoOperator& op,
+ flatbuffers::FlatBufferBuilder* builder) const override {
+ return ::tflite::CreatePackOptions(*builder, op.values_count, op.axis);
+ }
+
+ void ReadOptions(const TfLiteOptions& options,
+ TocoOperator* op) const override {
+ op->values_count = options.values_count();
+ op->axis = options.axis();
+ }
+
+ int GetVersion(const Operator& op) const override { return 1; }
+};
+
class Shape
: public BuiltinOperator<TensorFlowShapeOperator, ::tflite::ShapeOptions,
::tflite::BuiltinOptions_ShapeOptions> {
@@ -1033,6 +1053,23 @@ class Shape
int GetVersion(const Operator& op) const override { return 1; }
};
+class OneHot : public BuiltinOperator<OneHotOperator, ::tflite::OneHotOptions,
+ ::tflite::BuiltinOptions_OneHotOptions> {
+ public:
+ using BuiltinOperator::BuiltinOperator;
+ flatbuffers::Offset<TfLiteOptions> WriteOptions(
+ const TocoOperator& op,
+ flatbuffers::FlatBufferBuilder* builder) const override {
+ return ::tflite::CreateOneHotOptions(*builder, op.axis);
+ }
+ void ReadOptions(const TfLiteOptions& options,
+ TocoOperator* op) const override {
+ op->axis = options.axis();
+ }
+
+ int GetVersion(const Operator& op) const override { return 1; }
+};
+
class TensorFlowUnsupported : public BaseOperator {
public:
using BaseOperator::BaseOperator;
@@ -1256,6 +1293,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
new Shape(::tflite::BuiltinOperator_SHAPE, OperatorType::kShape));
ops.emplace_back(new FakeQuant(::tflite::BuiltinOperator_FAKE_QUANT,
OperatorType::kFakeQuant));
+ ops.emplace_back(
+ new Pack(::tflite::BuiltinOperator_PACK, OperatorType::kPack));
+ ops.emplace_back(
+ new OneHot(::tflite::BuiltinOperator_ONE_HOT, OperatorType::kOneHot));
// Custom Operators.
ops.emplace_back(
@@ -1309,6 +1350,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
ops.emplace_back(
new SimpleOperator<SliceOperator>("SLICE", OperatorType::kSlice));
ops.emplace_back(new SimpleOperator<PowOperator>("POW", OperatorType::kPow));
+ ops.emplace_back(new SimpleOperator<LogicalOrOperator>(
+ "LOGICAL_OR", OperatorType::kLogicalOr));
// Element-wise operator
ops.emplace_back(new SimpleOperator<SinOperator>("SIN", OperatorType::kSin));
ops.emplace_back(new SimpleOperator<LogOperator>("LOG", OperatorType::kLog));
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index ff2d35b1f5..384f7c118d 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -127,6 +127,8 @@ TEST_F(OperatorTest, SimpleOperators) {
CheckSimpleOperator<TensorFlowSqrtOperator>("SQRT", OperatorType::kSqrt);
CheckSimpleOperator<TensorFlowRsqrtOperator>("RSQRT", OperatorType::kRsqrt);
CheckSimpleOperator<PowOperator>("POW", OperatorType::kPow);
+ CheckSimpleOperator<LogicalOrOperator>("LOGICAL_OR",
+ OperatorType::kLogicalOr);
}
TEST_F(OperatorTest, BuiltinAdd) {
@@ -452,6 +454,24 @@ TEST_F(OperatorTest, BuiltinSparseToDense) {
EXPECT_EQ(op.validate_indices, output_toco_op->validate_indices);
}
+TEST_F(OperatorTest, BuiltinPack) {
+ PackOperator op;
+ op.values_count = 3;
+ op.axis = 1;
+ std::unique_ptr<toco::PackOperator> output_toco_op =
+ SerializeAndDeserialize(GetOperator("PACK", OperatorType::kPack), op);
+ EXPECT_EQ(op.values_count, output_toco_op->values_count);
+ EXPECT_EQ(op.axis, output_toco_op->axis);
+}
+
+TEST_F(OperatorTest, BuiltinOneHot) {
+ OneHotOperator op;
+ op.axis = 2;
+ auto output_toco_op = SerializeAndDeserialize(
+ GetOperator("ONE_HOT", OperatorType::kOneHot), op);
+ EXPECT_EQ(op.axis, output_toco_op->axis);
+}
+
TEST_F(OperatorTest, TensorFlowUnsupported) {
TensorFlowUnsupportedOperator op;
op.tensorflow_op = "MyCustomUnsupportedOp";
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index aa7f6996eb..fcd3cbab07 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -309,8 +309,9 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
// HardcodeMinMax to move changes through the graph as we make changes.
auto propagate_default_min_max =
absl::make_unique<PropagateDefaultMinMax>();
- if (toco_flags.has_default_ranges_min() &&
- toco_flags.has_default_ranges_max()) {
+ bool has_default_ranges_flag = (toco_flags.has_default_ranges_min() &&
+ toco_flags.has_default_ranges_max());
+ if (has_default_ranges_flag) {
propagate_default_min_max->DefineTypeRange(
ArrayDataType::kUint8, toco_flags.default_ranges_min(),
toco_flags.default_ranges_max());
@@ -335,6 +336,8 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
new EnsureUint8WeightsSafeForFastInt8Kernels;
ensure_safe_for_int8_kernels->set_allow_nudging_weights(
toco_flags.allow_nudging_weights_to_use_fast_gemm_kernel());
+ ensure_safe_for_int8_kernels->set_has_default_ranges_flag(
+ has_default_ranges_flag);
RunGraphTransformations(model, "quantization graph transformations",
{
new RemoveTrivialQuantizedActivationFunc,
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 52f8df45a2..7f5da251d0 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -356,6 +356,7 @@ const char* OperatorTypeName(OperatorType type) {
HANDLE_OPERATORTYPENAME_CASE(ReduceMin) // Reduction Min
HANDLE_OPERATORTYPENAME_CASE(Minimum) // Element-wise Minimum
HANDLE_OPERATORTYPENAME_CASE(Neg)
+ HANDLE_OPERATORTYPENAME_CASE(OneHot)
HANDLE_OPERATORTYPENAME_CASE(Pack)
HANDLE_OPERATORTYPENAME_CASE(Pad)
HANDLE_OPERATORTYPENAME_CASE(PadV2)
@@ -402,6 +403,7 @@ const char* OperatorTypeName(OperatorType type) {
HANDLE_OPERATORTYPENAME_CASE(Any)
HANDLE_OPERATORTYPENAME_CASE(LogicalAnd)
HANDLE_OPERATORTYPENAME_CASE(LogicalNot)
+ HANDLE_OPERATORTYPENAME_CASE(LogicalOr)
default:
LOG(FATAL) << "Unhandled op type";
#undef HANDLE_OPERATORTYPENAME_CASE
@@ -1585,11 +1587,6 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
model);
}
- for (const auto& input_array : model->flags.input_arrays()) {
- if (input_array.has_shape()) {
- CHECK(input_array.shape().dims_size());
- }
- }
model->flags.set_change_concat_input_ranges(
model_flags.change_concat_input_ranges());
model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
diff --git a/tensorflow/contrib/lite/toco/tooling_util_test.cc b/tensorflow/contrib/lite/toco/tooling_util_test.cc
index 8609e5bedd..eb495646a2 100644
--- a/tensorflow/contrib/lite/toco/tooling_util_test.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util_test.cc
@@ -39,6 +39,8 @@ std::vector<ShapePair> CreateShapePairs() {
{Shape({256, 256, 3}), Shape({256, 256, 3}), Agreement::kBroadcast},
{Shape({256, 256, 3}), Shape({3}), Agreement::kBroadcast},
{Shape({8, 1, 6, 1}), Shape({7, 1, 5}), Agreement::kBroadcast},
+ {Shape({}), Shape({3}), Agreement::kBroadcast},
+ {Shape({}), Shape({3, 1}), Agreement::kBroadcast},
// These extend (and therefore broadcast).
{Shape({3}), Shape({3}), Agreement::kExtend},
@@ -54,6 +56,7 @@ std::vector<ShapePair> CreateShapePairs() {
{Shape({15, 3, 5}), Shape({15, 1, 5}), Agreement::kBroadcastNotExtend},
{Shape({15, 3, 5}), Shape({3, 5}), Agreement::kBroadcastNotExtend},
{Shape({15, 3, 5}), Shape({3, 1}), Agreement::kBroadcastNotExtend},
+ {Shape({3, 1}), Shape({}), Agreement::kBroadcastNotExtend},
// These do not broadcast (and therefore also do not extend).
{Shape({3}), Shape({4}), Agreement::kNeither},
@@ -175,6 +178,20 @@ TEST(NumElementsTest, UnsignedInt64) {
EXPECT_EQ(status.error_message(), kLargeTensorMessage);
}
+TEST(NumElementsTest, Scalar) {
+ tensorflow::Status status = tensorflow::Status::OK();
+
+ int32_t count;
+ status = NumElements(std::vector<int32_t>{}, &count);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(count, 1);
+
+ uint64_t countu64;
+ status = NumElements(std::vector<uint64_t>{}, &countu64);
+ EXPECT_TRUE(status.ok());
+ EXPECT_EQ(countu64, 1ULL);
+}
+
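
The new Scalar test pins down the empty-product convention: a rank-0 shape holds exactly one element. A short Python sketch of the invariant:

    from functools import reduce
    from operator import mul

    def num_elements(dims):
        # The empty product is 1, so a scalar has one element.
        return reduce(mul, dims, 1)

    assert num_elements([]) == 1
    assert num_elements([2, 3, 4]) == 24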
TEST(FusedActivationTest, DefaultsToUnfused) {
EXPECT_TRUE(OperatorSupportsFusedActivation(OperatorType::kAdd));
EXPECT_FALSE(OperatorSupportsFusedActivation(OperatorType::kNone));
diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
index 810e25961f..2cb07eb6ec 100644
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -10,11 +10,16 @@ load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
common_copts = ["-Wall"] + tflite_copts()
+cc_library(
+ name = "logging",
+ hdrs = ["logging.h"],
+ copts = common_copts,
+)
+
cc_binary(
name = "benchmark_model",
srcs = [
"benchmark_main.cc",
- "logging.h",
],
copts = common_copts,
linkopts = tflite_linkopts() + select({
@@ -26,6 +31,26 @@ cc_binary(
}),
deps = [
":benchmark_tflite_model_lib",
+ ":logging",
+ ],
+)
+
+cc_test(
+ name = "benchmark_test",
+ srcs = ["benchmark_test.cc"],
+ args = [
+ "--graph=$(location //tensorflow/contrib/lite:testdata/multi_add.bin)",
+ ],
+ data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
+ tags = [
+ "tflite_not_portable_android",
+ "tflite_not_portable_ios",
+ ],
+ deps = [
+ ":benchmark_tflite_model_lib",
+ ":command_line_flags",
+ "//tensorflow/contrib/lite/testing:util",
+ "@com_google_googletest//:gtest",
],
)
@@ -40,7 +65,6 @@ cc_test(
name = "command_line_flags_test",
srcs = ["command_line_flags_test.cc"],
copts = common_copts,
- tags = ["no_oss"],
visibility = ["//visibility:private"],
deps = [
":command_line_flags",
@@ -59,6 +83,7 @@ cc_library(
copts = common_copts,
deps = [
":benchmark_model_lib",
+ ":logging",
"//tensorflow/contrib/lite:framework",
"//tensorflow/contrib/lite:string_util",
"//tensorflow/contrib/lite/kernels:builtin_ops",
@@ -71,23 +96,23 @@ cc_library(
name = "benchmark_params",
srcs = [
"benchmark_params.cc",
- "logging.h",
],
hdrs = ["benchmark_params.h"],
copts = common_copts,
+ deps = [":logging"],
)
cc_library(
name = "benchmark_model_lib",
srcs = [
"benchmark_model.cc",
- "logging.h",
],
hdrs = ["benchmark_model.h"],
copts = common_copts,
deps = [
":benchmark_params",
":command_line_flags",
+ ":logging",
"//tensorflow/contrib/lite:framework",
"//tensorflow/contrib/lite:string_util",
"//tensorflow/contrib/lite/kernels:builtin_ops",
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
index 19b9a9c7ba..f86c0445b0 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
@@ -84,7 +84,7 @@ std::vector<Flag> BenchmarkModel::GetFlags() {
};
}
-void BenchmarkModel::LogFlags() {
+void BenchmarkModel::LogParams() {
TFLITE_LOG(INFO) << "Num runs: [" << params_.Get<int32_t>("num_runs") << "]";
TFLITE_LOG(INFO) << "Inter-run delay (seconds): ["
<< params_.Get<float>("run_delay") << "]";
@@ -122,12 +122,18 @@ Stat<int64_t> BenchmarkModel::Run(int num_times, RunType run_type) {
return run_stats;
}
+bool BenchmarkModel::ValidateParams() { return true; }
+
void BenchmarkModel::Run(int argc, char **argv) {
if (!ParseFlags(argc, argv)) {
return;
}
+ Run();
+}
- LogFlags();
+void BenchmarkModel::Run() {
+ ValidateParams();
+ LogParams();
listeners_.OnBenchmarkStart(params_);
int64_t initialization_start_us = profiling::time::NowMicros();
@@ -155,7 +161,7 @@ bool BenchmarkModel::ParseFlags(int argc, char **argv) {
TFLITE_LOG(ERROR) << usage;
return false;
}
- return ValidateFlags();
+ return true;
}
} // namespace benchmark
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
index 3c7063b2d4..677a1ee68c 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
@@ -137,16 +137,17 @@ class BenchmarkModel {
BenchmarkModel();
BenchmarkModel(BenchmarkParams params) : params_(std::move(params)) {}
virtual ~BenchmarkModel() {}
- bool ParseFlags(int argc, char** argv);
virtual void Init() = 0;
void Run(int argc, char** argv);
+ virtual void Run();
void AddListener(BenchmarkListener* listener) {
listeners_.AddListener(listener);
}
protected:
- virtual void LogFlags();
- virtual bool ValidateFlags() { return true; }
+ virtual void LogParams();
+ virtual bool ValidateParams();
+ bool ParseFlags(int argc, char** argv);
virtual std::vector<Flag> GetFlags();
virtual uint64_t ComputeInputBytes() = 0;
virtual tensorflow::Stat<int64_t> Run(int num_times, RunType run_type);
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc
new file mode 100644
index 0000000000..b697bb394d
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/contrib/lite/tools/benchmark/command_line_flags.h"
+
+namespace {
+const std::string* g_model_path = nullptr;
+}
+
+namespace tflite {
+namespace benchmark {
+namespace {
+
+BenchmarkParams CreateParams() {
+ BenchmarkParams params;
+ params.AddParam("num_runs", BenchmarkParam::Create<int32_t>(2));
+ params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
+ params.AddParam("num_threads", BenchmarkParam::Create<int32_t>(1));
+ params.AddParam("benchmark_name", BenchmarkParam::Create<std::string>(""));
+ params.AddParam("output_prefix", BenchmarkParam::Create<std::string>(""));
+ params.AddParam("warmup_runs", BenchmarkParam::Create<int32_t>(1));
+ params.AddParam("graph", BenchmarkParam::Create<std::string>(*g_model_path));
+ params.AddParam("input_layer", BenchmarkParam::Create<std::string>(""));
+ params.AddParam("input_layer_shape", BenchmarkParam::Create<std::string>(""));
+ params.AddParam("use_nnapi", BenchmarkParam::Create<bool>(false));
+ return params;
+}
+
+TEST(BenchmarkTest, DoesntCrash) {
+ ASSERT_THAT(g_model_path, testing::NotNull());
+
+ BenchmarkTfLiteModel benchmark(CreateParams());
+ benchmark.Run();
+}
+
+} // namespace
+} // namespace benchmark
+} // namespace tflite
+
+int main(int argc, char** argv) {
+ std::string model_path;
+ std::vector<tflite::Flag> flags = {
+ tflite::Flag::CreateFlag("graph", &model_path, "Path to model file.")};
+ g_model_path = &model_path;
+ const bool parse_result =
+ tflite::Flags::Parse(&argc, const_cast<const char**>(argv), flags);
+ if (!parse_result) {
+ std::cerr << tflite::Flags::Usage(argv[0], flags);
+ return 1;
+ }
+
+ ::tflite::LogToStderr();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
index 73affc26b0..7f97f5d0cd 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -198,8 +198,8 @@ std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
return flags;
}
-void BenchmarkTfLiteModel::LogFlags() {
- BenchmarkModel::LogFlags();
+void BenchmarkTfLiteModel::LogParams() {
+ BenchmarkModel::LogParams();
TFLITE_LOG(INFO) << "Graph: [" << params_.Get<std::string>("graph") << "]";
TFLITE_LOG(INFO) << "Input layers: ["
<< params_.Get<std::string>("input_layer") << "]";
@@ -208,7 +208,7 @@ void BenchmarkTfLiteModel::LogFlags() {
TFLITE_LOG(INFO) << "Use nnapi : [" << params_.Get<bool>("use_nnapi") << "]";
}
-bool BenchmarkTfLiteModel::ValidateFlags() {
+bool BenchmarkTfLiteModel::ValidateParams() {
if (params_.Get<std::string>("graph").empty()) {
TFLITE_LOG(ERROR)
<< "Please specify the name of your TF Lite input file with --graph";
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
index 50cc3f24b3..9931dcbafe 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
@@ -54,8 +54,8 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
BenchmarkTfLiteModel(BenchmarkParams params);
std::vector<Flag> GetFlags() override;
- void LogFlags() override;
- bool ValidateFlags() override;
+ void LogParams() override;
+ bool ValidateParams() override;
uint64_t ComputeInputBytes() override;
void Init() override;
void RunImpl() override;
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 5225ecc14f..3ba3ee29ec 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -193,6 +193,7 @@ cuda_py_test(
srcs = ["rmsprop_test.py"],
additional_deps = [
":training",
+ "@absl_py//absl/testing:parameterized",
"//tensorflow/python:array_ops",
"//tensorflow/python:embedding_ops",
"//tensorflow/python:framework",
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
index ec033c4a01..a44bfd1bfd 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
@@ -38,12 +38,8 @@ class OptimizerTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testBasic(self):
for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
- # Note that we name the variables uniquely here since the variables don't
- # seem to be getting deleted at the end of the loop.
- var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
- name='a_%d' % i)
- var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
- name='b_%d' % i)
+ var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+ var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
def loss():
return 5 * var0 + 3 * var1 # pylint: disable=cell-var-from-loop
# Note that for eager execution, minimize expects a function instead of a
@@ -131,12 +127,8 @@ class OptimizerTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testNoGradients(self):
for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
- # Note that we name the variables uniquely here since the variables don't
- # seem to be getting deleted at the end of the loop.
- var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
- name='a%d' % i)
- var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
- name='b%d' % i)
+ var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+ var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
# pylint: disable=cell-var-from-loop
def loss():
return 5 * var0
@@ -149,12 +141,8 @@ class OptimizerTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testNoGradientsForAnyVariables_Minimize(self):
for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
- # Note that we name the variables uniquely here since the variables don't
- # seem to be getting deleted at the end of the loop.
- var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
- name='a_%d' % i)
- var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
- name='b_%d' % i)
+ var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+ var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
def loss():
return constant_op.constant(5.0)
sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
@@ -165,12 +153,8 @@ class OptimizerTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testNoGradientsForAnyVariables_ApplyGradients(self):
for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
- # Note that we name the variables uniquely here since the variables don't
- # seem to be getting deleted at the end of the loop.
- var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
- name='a_%d' % i)
- var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
- name='b_%d' % i)
+ var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+ var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
with self.assertRaisesRegexp(ValueError,
'No gradients provided for any variable'):
@@ -179,12 +163,8 @@ class OptimizerTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testGradientsAsVariables(self):
for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
- # Note that we name the variables uniquely here since the variables don't
- # seem to be getting deleted at the end of the loop.
- var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
- name='a%d' % i)
- var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
- name='b%d' % i)
+ var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+ var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
def loss():
return 5 * var0 + 3 * var1 # pylint: disable=cell-var-from-loop
sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
index ed68f6afbf..dc23ef241a 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -19,15 +19,16 @@ from __future__ import division
from __future__ import print_function
import copy
-import itertools
import math
+from absl.testing import parameterized
import numpy as np
from tensorflow.contrib.optimizer_v2 import rmsprop
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
@@ -48,13 +49,8 @@ _TEST_PARAM_VALUES = [
[0.5, 0.95, 0.9, 1e-5, True, False],
]
-_TESTPARAMS = [
- [data_type] + values
- for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES)
-]
-
-class RMSPropOptimizerTest(test.TestCase):
+class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, decay, momentum,
epsilon, centered):
@@ -87,362 +83,366 @@ class RMSPropOptimizerTest(test.TestCase):
var_t[gindex] = var[gindex] - mom_t[gindex]
return var_t, mg_t, rms_t, mom_t
- def testDense(self):
- # TODO(yori): Use ParameterizedTest when available
- for (dtype, learning_rate, decay, momentum,
- epsilon, centered, use_resource) in _TESTPARAMS:
- with self.test_session(use_gpu=True):
- # Initialize variables for numpy implementation.
- var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
- grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
- var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
- grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
-
- if use_resource:
- var0 = resource_variable_ops.ResourceVariable(var0_np)
- var1 = resource_variable_ops.ResourceVariable(var1_np)
- else:
- var0 = variables.Variable(var0_np)
- var1 = variables.Variable(var1_np)
- grads0 = constant_op.constant(grads0_np)
- grads1 = constant_op.constant(grads1_np)
- opt = rmsprop.RMSPropOptimizer(
- learning_rate=learning_rate,
- decay=decay,
- momentum=momentum,
- epsilon=epsilon,
- centered=centered)
-
- update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
-
- mg0 = opt.get_slot(var0, "mg")
- self.assertEqual(mg0 is not None, centered)
- mg1 = opt.get_slot(var1, "mg")
- self.assertEqual(mg1 is not None, centered)
- rms0 = opt.get_slot(var0, "rms")
- self.assertTrue(rms0 is not None)
- rms1 = opt.get_slot(var1, "rms")
- self.assertTrue(rms1 is not None)
- mom0 = opt.get_slot(var0, "momentum")
- self.assertTrue(mom0 is not None)
- mom1 = opt.get_slot(var1, "momentum")
- self.assertTrue(mom1 is not None)
-
- mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
- mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
- rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
- rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
- mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
- mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-
- # Fetch params to validate initial values
- self.assertAllClose([1.0, 2.0], var0.eval())
- self.assertAllClose([3.0, 4.0], var1.eval())
-
- # Run 4 steps of RMSProp
- for _ in range(1, 5):
- update.run()
-
- var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
- var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
- decay, momentum, epsilon, centered)
- var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
- var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
- decay, momentum, epsilon, centered)
-
- # Validate updated params
- if centered:
- self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
- self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
- self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
- self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
- self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
- self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
- self.assertAllCloseAccordingToType(var0_np, var0.eval())
- self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
- def testMinimizeSparseResourceVariable(self):
- for dtype in [dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
- x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
- pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
- loss = pred * pred
- sgd_op = rmsprop.RMSPropOptimizer(
- learning_rate=1.0,
- decay=0.0,
- momentum=0.0,
- epsilon=0.0,
- centered=False).minimize(loss)
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
- # Run 1 step of sgd
- sgd_op.run()
- # Validate updated params
- self.assertAllCloseAccordingToType(
- [[0., 1.]], var0.eval(), atol=0.01)
-
- def testMinimizeSparseResourceVariableCentered(self):
- for dtype in [dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
- x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
- pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
- loss = pred * pred
- sgd_op = rmsprop.RMSPropOptimizer(
- learning_rate=1.0,
- decay=0.0,
- momentum=0.0,
- epsilon=1.0,
- centered=True).minimize(loss)
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
- # Run 1 step of sgd
- sgd_op.run()
- # Validate updated params
- self.assertAllCloseAccordingToType(
- [[-111, -138]], var0.eval(), atol=0.01)
-
- def testSparse(self):
- # TODO(yori): Use ParameterizedTest when available
- for (dtype, learning_rate, decay,
- momentum, epsilon, centered, _) in _TESTPARAMS:
- with self.test_session(use_gpu=True):
- # Initialize variables for numpy implementation.
- var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
- grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
- var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
- grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
-
+ @parameterized.named_parameters(
+ *test_util.generate_combinations_with_testcase_name(
+ dtype=_DATA_TYPES, param_value=_TEST_PARAM_VALUES))
+ def testDense(self, dtype, param_value):
+ (learning_rate, decay, momentum, epsilon, centered, use_resource) = tuple(
+ param_value)
+ with self.test_session(use_gpu=True):
+ # Initialize variables for numpy implementation.
+ var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+ grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
+ var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+ grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
+
+ if use_resource:
+ var0 = resource_variable_ops.ResourceVariable(var0_np)
+ var1 = resource_variable_ops.ResourceVariable(var1_np)
+ else:
var0 = variables.Variable(var0_np)
var1 = variables.Variable(var1_np)
- grads0_np_indices = np.array([0], dtype=np.int32)
- grads0 = ops.IndexedSlices(
- constant_op.constant(grads0_np),
- constant_op.constant(grads0_np_indices), constant_op.constant([1]))
- grads1_np_indices = np.array([1], dtype=np.int32)
- grads1 = ops.IndexedSlices(
- constant_op.constant(grads1_np),
- constant_op.constant(grads1_np_indices), constant_op.constant([1]))
- opt = rmsprop.RMSPropOptimizer(
- learning_rate=learning_rate,
- decay=decay,
- momentum=momentum,
- epsilon=epsilon,
- centered=centered)
- update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
-
- mg0 = opt.get_slot(var0, "mg")
- self.assertEqual(mg0 is not None, centered)
- mg1 = opt.get_slot(var1, "mg")
- self.assertEqual(mg1 is not None, centered)
- rms0 = opt.get_slot(var0, "rms")
- self.assertTrue(rms0 is not None)
- rms1 = opt.get_slot(var1, "rms")
- self.assertTrue(rms1 is not None)
- mom0 = opt.get_slot(var0, "momentum")
- self.assertTrue(mom0 is not None)
- mom1 = opt.get_slot(var1, "momentum")
- self.assertTrue(mom1 is not None)
-
- mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
- mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
- rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
- rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
- mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
- mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-
- # Fetch params to validate initial values
- self.assertAllClose([1.0, 2.0], var0.eval())
- self.assertAllClose([3.0, 4.0], var1.eval())
-
- # Run 4 steps of RMSProp
- for _ in range(1, 5):
- update.run()
-
- var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
- var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
- learning_rate, decay, momentum, epsilon, centered)
- var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
- var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
- learning_rate, decay, momentum, epsilon, centered)
-
- # Validate updated params
- if centered:
- self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
- self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
- self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
- self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
- self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
- self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
- self.assertAllCloseAccordingToType(var0_np, var0.eval())
- self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
- def testWithoutMomentum(self):
- for dtype in [dtypes.half, dtypes.float32]:
- with self.test_session(use_gpu=True):
- var0 = variables.Variable([1.0, 2.0], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- opt = rmsprop.RMSPropOptimizer(
- learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
- update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
-
- rms0 = opt.get_slot(var0, "rms")
- self.assertTrue(rms0 is not None)
- rms1 = opt.get_slot(var1, "rms")
- self.assertTrue(rms1 is not None)
- mom0 = opt.get_slot(var0, "momentum")
- self.assertTrue(mom0 is not None)
- mom1 = opt.get_slot(var1, "momentum")
- self.assertTrue(mom1 is not None)
-
- # Fetch params to validate initial values
- self.assertAllClose([1.0, 2.0], var0.eval())
- self.assertAllClose([3.0, 4.0], var1.eval())
- # Step 1: the rms accumulators where 1. So we should see a normal
- # update: v -= grad * learning_rate
- update.run()
- # Check the root mean square accumulators.
- self.assertAllCloseAccordingToType(
- np.array([0.901, 0.901]), rms0.eval())
- self.assertAllCloseAccordingToType(
- np.array([0.90001, 0.90001]), rms1.eval())
- # Check the parameters.
- self.assertAllCloseAccordingToType(
- np.array([
- 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
- 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
- ]), var0.eval())
- self.assertAllCloseAccordingToType(
- np.array([
- 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
- 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
- ]), var1.eval())
- # Step 2: the root mean square accumulators contain the previous update.
- update.run()
- # Check the rms accumulators.
- self.assertAllCloseAccordingToType(
- np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
- self.assertAllCloseAccordingToType(
- np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
- # Check the parameters.
- self.assertAllCloseAccordingToType(
- np.array([
- 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
- 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
- ]), var0.eval())
- self.assertAllCloseAccordingToType(
- np.array([
- 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
- 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
- ]), var1.eval())
-
- def testWithMomentum(self):
- for dtype in [dtypes.half, dtypes.float32]:
- with self.test_session(use_gpu=True):
- var0 = variables.Variable([1.0, 2.0], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-
- opt = rmsprop.RMSPropOptimizer(
- learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
- update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
-
- rms0 = opt.get_slot(var0, "rms")
- self.assertTrue(rms0 is not None)
- rms1 = opt.get_slot(var1, "rms")
- self.assertTrue(rms1 is not None)
- mom0 = opt.get_slot(var0, "momentum")
- self.assertTrue(mom0 is not None)
- mom1 = opt.get_slot(var1, "momentum")
- self.assertTrue(mom1 is not None)
-
- # Fetch params to validate initial values
- self.assertAllClose([1.0, 2.0], var0.eval())
- self.assertAllClose([3.0, 4.0], var1.eval())
- # Step 1: rms = 1, mom = 0. So we should see a normal
- # update: v -= grad * learning_rate
+ grads0 = constant_op.constant(grads0_np)
+ grads1 = constant_op.constant(grads1_np)
+ opt = rmsprop.RMSPropOptimizer(
+ learning_rate=learning_rate,
+ decay=decay,
+ momentum=momentum,
+ epsilon=epsilon,
+ centered=centered)
+
+ update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+ variables.global_variables_initializer().run()
+
+ mg0 = opt.get_slot(var0, "mg")
+ self.assertEqual(mg0 is not None, centered)
+ mg1 = opt.get_slot(var1, "mg")
+ self.assertEqual(mg1 is not None, centered)
+ rms0 = opt.get_slot(var0, "rms")
+ self.assertIsNotNone(rms0)
+ rms1 = opt.get_slot(var1, "rms")
+ self.assertIsNotNone(rms1)
+ mom0 = opt.get_slot(var0, "momentum")
+ self.assertIsNotNone(mom0)
+ mom1 = opt.get_slot(var1, "momentum")
+ self.assertIsNotNone(mom1)
+
+ mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+ mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+ rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+ rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+ mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+ mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+ # Fetch params to validate initial values
+ self.assertAllClose([1.0, 2.0], var0.eval())
+ self.assertAllClose([3.0, 4.0], var1.eval())
+
+ # Run 4 steps of RMSProp
+ for _ in range(4):
update.run()
- # Check the root mean square accumulators.
- self.assertAllCloseAccordingToType(
- np.array([0.901, 0.901]), rms0.eval())
- self.assertAllCloseAccordingToType(
- np.array([0.90001, 0.90001]), rms1.eval())
- # Check the momentum accumulators
- self.assertAllCloseAccordingToType(
- np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
- (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
- self.assertAllCloseAccordingToType(
- np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
- (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
-
- # Check that the parameters.
- self.assertAllCloseAccordingToType(
- np.array([
- 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
- 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
- ]), var0.eval())
- self.assertAllCloseAccordingToType(
- np.array([
- 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
- 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
- ]), var1.eval())
-
- # Step 2: the root mean square accumulators contain the previous update.
+
+ var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+ var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
+ decay, momentum, epsilon, centered)
+ var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+ var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
+ decay, momentum, epsilon, centered)
+
+ # Validate updated params
+ if centered:
+ self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+ self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+ self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+ self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+ self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+ self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+ self.assertAllCloseAccordingToType(var0_np, var0.eval())
+ self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+ @parameterized.parameters([dtypes.float32, dtypes.float64])
+ def testMinimizeSparseResourceVariable(self, dtype):
+ with self.test_session():
+ var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+ x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+ pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+ loss = pred * pred
+ sgd_op = rmsprop.RMSPropOptimizer(
+ learning_rate=1.0,
+ decay=0.0,
+ momentum=0.0,
+ epsilon=0.0,
+ centered=False).minimize(loss)
+ variables.global_variables_initializer().run()
+ # Fetch params to validate initial values
+ self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+ # Run 1 step of sgd
+ sgd_op.run()
+ # Validate updated params
+ self.assertAllCloseAccordingToType(
+ [[0., 1.]], var0.eval(), atol=0.01)
+
+ @parameterized.parameters([dtypes.float32, dtypes.float64])
+ def testMinimizeSparseResourceVariableCentered(self, dtype):
+ with self.test_session():
+ var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+ x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+ pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+ loss = pred * pred
+ sgd_op = rmsprop.RMSPropOptimizer(
+ learning_rate=1.0,
+ decay=0.0,
+ momentum=0.0,
+ epsilon=1.0,
+ centered=True).minimize(loss)
+ variables.global_variables_initializer().run()
+ # Fetch params to validate initial values
+ self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+ # Run 1 step of sgd
+ sgd_op.run()
+ # Validate updated params
+ self.assertAllCloseAccordingToType(
+ [[-111, -138]], var0.eval(), atol=0.01)
+
+ @parameterized.named_parameters(
+ *test_util.generate_combinations_with_testcase_name(
+ dtype=_DATA_TYPES, param_value=_TEST_PARAM_VALUES))
+ def testSparse(self, dtype, param_value):
+ (learning_rate, decay, momentum, epsilon, centered, _) = tuple(
+ param_value)
+ with self.test_session(use_gpu=True):
+ # Initialize variables for numpy implementation.
+ var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+ grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
+ var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+ grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
+
+ var0 = variables.Variable(var0_np)
+ var1 = variables.Variable(var1_np)
+ grads0_np_indices = np.array([0], dtype=np.int32)
+ grads0 = ops.IndexedSlices(
+ constant_op.constant(grads0_np),
+ constant_op.constant(grads0_np_indices), constant_op.constant([1]))
+ grads1_np_indices = np.array([1], dtype=np.int32)
+ grads1 = ops.IndexedSlices(
+ constant_op.constant(grads1_np),
+ constant_op.constant(grads1_np_indices), constant_op.constant([1]))
+ opt = rmsprop.RMSPropOptimizer(
+ learning_rate=learning_rate,
+ decay=decay,
+ momentum=momentum,
+ epsilon=epsilon,
+ centered=centered)
+ update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+ variables.global_variables_initializer().run()
+
+ mg0 = opt.get_slot(var0, "mg")
+ self.assertEqual(mg0 is not None, centered)
+ mg1 = opt.get_slot(var1, "mg")
+ self.assertEqual(mg1 is not None, centered)
+ rms0 = opt.get_slot(var0, "rms")
+ self.assertIsNotNone(rms0)
+ rms1 = opt.get_slot(var1, "rms")
+ self.assertIsNotNone(rms1)
+ mom0 = opt.get_slot(var0, "momentum")
+ self.assertIsNotNone(mom0)
+ mom1 = opt.get_slot(var1, "momentum")
+ self.assertIsNotNone(mom1)
+
+ mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+ mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+ rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+ rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+ mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+ mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+ # Fetch params to validate initial values
+ self.assertAllClose([1.0, 2.0], var0.eval())
+ self.assertAllClose([3.0, 4.0], var1.eval())
+
+ # Run 4 steps of RMSProp
+ for _ in range(4):
update.run()
- # Check the rms accumulators.
- self.assertAllCloseAccordingToType(
- np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
- self.assertAllCloseAccordingToType(
- np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
- self.assertAllCloseAccordingToType(
- np.array([
- 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
- 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
- ]), mom0.eval())
- self.assertAllCloseAccordingToType(
- np.array([
- 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
- 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
- ]), mom1.eval())
-
- # Check the parameters.
- self.assertAllCloseAccordingToType(
- np.array([
- 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
- (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
- 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
- (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
- ]), var0.eval())
-
- self.assertAllCloseAccordingToType(
- np.array([
- 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
- (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
- 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
- (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
- ]), var1.eval())
+
+ var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
+ var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
+ learning_rate, decay, momentum, epsilon, centered)
+ var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
+ var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
+ learning_rate, decay, momentum, epsilon, centered)
+
+ # Validate updated params
+ if centered:
+ self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+ self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+ self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+ self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+ self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+ self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+ self.assertAllCloseAccordingToType(var0_np, var0.eval())
+ self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+ @parameterized.parameters(_DATA_TYPES)
+ def testWithoutMomentum(self, dtype):
+ with self.test_session(use_gpu=True):
+ var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+ var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+ grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+ grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+ opt = rmsprop.RMSPropOptimizer(
+ learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
+ update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+ variables.global_variables_initializer().run()
+
+ rms0 = opt.get_slot(var0, "rms")
+ self.assertIsNotNone(rms0)
+ rms1 = opt.get_slot(var1, "rms")
+ self.assertIsNotNone(rms1)
+ mom0 = opt.get_slot(var0, "momentum")
+ self.assertIsNotNone(mom0)
+ mom1 = opt.get_slot(var1, "momentum")
+ self.assertIsNotNone(mom1)
+
+ # Fetch params to validate initial values
+ self.assertAllClose([1.0, 2.0], var0.eval())
+ self.assertAllClose([3.0, 4.0], var1.eval())
+ # Step 1: the rms accumulators were 1. So we should see a normal
+ # update: v -= grad * learning_rate
+ update.run()
+ # Check the root mean square accumulators.
+ self.assertAllCloseAccordingToType(
+ np.array([0.901, 0.901]), rms0.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([0.90001, 0.90001]), rms1.eval())
+ # Check the parameters.
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
+ 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+ ]), var0.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
+ 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+ ]), var1.eval())
+ # Step 2: the root mean square accumulators contain the previous update.
+ update.run()
+ # Check the rms accumulators.
+ self.assertAllCloseAccordingToType(
+ np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+ # Check the parameters.
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
+ 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+ ]), var0.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
+ 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+ ]), var1.eval())
+
+ @parameterized.parameters(_DATA_TYPES)
+ def testWithMomentum(self, dtype):
+ with self.test_session(use_gpu=True):
+ var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+ var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+ grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+ grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+ opt = rmsprop.RMSPropOptimizer(
+ learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
+ update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+ variables.global_variables_initializer().run()
+
+ rms0 = opt.get_slot(var0, "rms")
+ self.assertIsNotNone(rms0)
+ rms1 = opt.get_slot(var1, "rms")
+ self.assertIsNotNone(rms1)
+ mom0 = opt.get_slot(var0, "momentum")
+ self.assertIsNotNone(mom0)
+ mom1 = opt.get_slot(var1, "momentum")
+ self.assertIsNotNone(mom1)
+
+ # Fetch params to validate initial values
+ self.assertAllClose([1.0, 2.0], var0.eval())
+ self.assertAllClose([3.0, 4.0], var1.eval())
+ # Step 1: rms = 1, mom = 0. So we should see a normal
+ # update: v -= grad * learning_rate
+ update.run()
+ # Check the root mean square accumulators.
+ self.assertAllCloseAccordingToType(
+ np.array([0.901, 0.901]), rms0.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([0.90001, 0.90001]), rms1.eval())
+ # Check the momentum accumulators.
+ self.assertAllCloseAccordingToType(
+ np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+ (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+ (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+
+ # Check the parameters.
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+ 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
+ ]), var0.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+ 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
+ ]), var1.eval())
+
+ # Step 2: the root mean square accumulators contain the previous update.
+ update.run()
+ # Check the rms accumulators.
+ self.assertAllCloseAccordingToType(
+ np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
+ 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
+ ]), mom0.eval())
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
+ 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
+ ]), mom1.eval())
+
+ # Check the parameters.
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+ (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
+ 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+ (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
+ ]), var0.eval())
+
+ self.assertAllCloseAccordingToType(
+ np.array([
+ 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+ (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
+ 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+ (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
+ ]), var1.eval())
if __name__ == "__main__":
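
For reference, the hand-computed assertions above follow the standard RMSProp recurrences that `_rmsprop_update_numpy` implements; a sketch in the usual notation (decay \rho, momentum \mu, learning rate \eta, stabilizer \epsilon):

    \mathrm{rms}_t = \rho\,\mathrm{rms}_{t-1} + (1-\rho)\,g_t^2
    \mathrm{mom}_t = \mu\,\mathrm{mom}_{t-1} + \eta\,g_t \big/ \sqrt{\mathrm{rms}_t + \epsilon}
    \theta_t = \theta_{t-1} - \mathrm{mom}_t

In the centered variant, \mathrm{mg}_t = \rho\,\mathrm{mg}_{t-1} + (1-\rho)\,g_t and the denominator becomes \sqrt{\mathrm{rms}_t - \mathrm{mg}_t^2 + \epsilon}. As a worked check: in testWithoutMomentum, \rho = 0.9, \eta = 2.0, \mu = 0, \epsilon = 1.0, g = 0.1, and \mathrm{rms}_0 = 1, so step one gives \mathrm{rms}_1 = 0.9 \cdot 1.0 + 0.1 \cdot 0.01 = 0.901 and \theta_1 = \theta_0 - 0.1 \cdot 2.0 / \sqrt{0.901 + 1.0}, exactly the quantities asserted.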
diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
index d45622174f..b27142cf4a 100644
--- a/tensorflow/contrib/proto/BUILD
+++ b/tensorflow/contrib/proto/BUILD
@@ -16,15 +16,3 @@ py_library(
"//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
],
)
-
-py_library(
- name = "proto_pip",
- data = if_static(
- [],
- otherwise = ["//tensorflow/contrib/proto/python/kernel_tests:libtestexample.so"],
- ),
- deps = [
- ":proto",
- "//tensorflow/contrib/proto/python/kernel_tests:py_test_deps",
- ],
-)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/BUILD b/tensorflow/contrib/proto/python/kernel_tests/BUILD
index 3c6fde23d2..125c1cee29 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/proto/python/kernel_tests/BUILD
@@ -100,3 +100,30 @@ tf_cc_shared_object(
":test_example_proto_cc",
],
)
+
+py_library(
+ name = "descriptor_source_test_base",
+ testonly = 1,
+ srcs = ["descriptor_source_test_base.py"],
+ deps = [
+ ":proto_op_test_base",
+ "//third_party/py/numpy",
+ "@absl_py//absl/testing:parameterized",
+ "@protobuf_archive//:protobuf_python",
+ ],
+)
+
+tf_py_test(
+ name = "descriptor_source_test",
+ size = "small",
+ srcs = ["descriptor_source_test.py"],
+ additional_deps = [
+ ":descriptor_source_test_base",
+ "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+ "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+ "//tensorflow/python:client_testlib",
+ ],
+ tags = [
+ "no_pip",
+ ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test.py b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test.py
new file mode 100644
index 0000000000..32ca318f73
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test.py
@@ -0,0 +1,36 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests for proto ops reading descriptors from other sources."""
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.kernel_tests import descriptor_source_test_base as test_base
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.python.platform import test
+
+
+class DescriptorSourceTest(test_base.DescriptorSourceTestBase):
+
+ def __init__(self, methodName='runTest'): # pylint: disable=invalid-name
+ super(DescriptorSourceTest, self).__init__(decode_proto_op, encode_proto_op,
+ methodName)
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py
new file mode 100644
index 0000000000..9a1c04af32
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py
@@ -0,0 +1,176 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests for proto ops reading descriptors from other sources."""
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from google.protobuf.descriptor_pb2 import FieldDescriptorProto
+from google.protobuf.descriptor_pb2 import FileDescriptorSet
+from tensorflow.contrib.proto.python.kernel_tests import proto_op_test_base as test_base
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class DescriptorSourceTestBase(test.TestCase):
+ """Base class for testing descriptor sources."""
+
+ def __init__(self, decode_module, encode_module, methodName='runTest'): # pylint: disable=invalid-name
+ """DescriptorSourceTestBase initializer.
+
+ Args:
+ decode_module: a module containing the `decode_proto_op` method
+ encode_module: a module containing the `encode_proto_op` method
+ methodName: the name of the test method (same as for test.TestCase)
+ """
+
+ super(DescriptorSourceTestBase, self).__init__(methodName)
+ self._decode_module = decode_module
+ self._encode_module = encode_module
+
+ # NOTE: We generate the descriptor programmatically instead of via a compiler
+ # because of differences between compiler versions.
+ #
+ # The generated descriptor should capture the subset of `test_example.proto`
+ # used in `test_base.simple_test_case()`.
+ def _createDescriptorFile(self):
+ set_proto = FileDescriptorSet()
+
+ file_proto = set_proto.file.add(
+ name='types.proto',
+ package='tensorflow',
+ syntax='proto3')
+ enum_proto = file_proto.enum_type.add(name='DataType')
+ enum_proto.value.add(name='DT_DOUBLE', number=0)
+ enum_proto.value.add(name='DT_BOOL', number=1)
+
+ file_proto = set_proto.file.add(
+ name='test_example.proto',
+ package='tensorflow.contrib.proto',
+ dependency=['types.proto'])
+ message_proto = file_proto.message_type.add(name='TestCase')
+ message_proto.field.add(
+ name='values',
+ number=1,
+ type=FieldDescriptorProto.TYPE_MESSAGE,
+ type_name='.tensorflow.contrib.proto.TestValue',
+ label=FieldDescriptorProto.LABEL_REPEATED)
+ message_proto.field.add(
+ name='shapes',
+ number=2,
+ type=FieldDescriptorProto.TYPE_INT32,
+ label=FieldDescriptorProto.LABEL_REPEATED)
+ message_proto.field.add(
+ name='sizes',
+ number=3,
+ type=FieldDescriptorProto.TYPE_INT32,
+ label=FieldDescriptorProto.LABEL_REPEATED)
+ message_proto.field.add(
+ name='fields',
+ number=4,
+ type=FieldDescriptorProto.TYPE_MESSAGE,
+ type_name='.tensorflow.contrib.proto.FieldSpec',
+ label=FieldDescriptorProto.LABEL_REPEATED)
+
+ message_proto = file_proto.message_type.add(
+ name='TestValue')
+ message_proto.field.add(
+ name='double_value',
+ number=1,
+ type=FieldDescriptorProto.TYPE_DOUBLE,
+ label=FieldDescriptorProto.LABEL_REPEATED)
+ message_proto.field.add(
+ name='bool_value',
+ number=2,
+ type=FieldDescriptorProto.TYPE_BOOL,
+ label=FieldDescriptorProto.LABEL_REPEATED)
+
+ message_proto = file_proto.message_type.add(
+ name='FieldSpec')
+ message_proto.field.add(
+ name='name',
+ number=1,
+ type=FieldDescriptorProto.TYPE_STRING,
+ label=FieldDescriptorProto.LABEL_OPTIONAL)
+ message_proto.field.add(
+ name='dtype',
+ number=2,
+ type=FieldDescriptorProto.TYPE_ENUM,
+ type_name='.tensorflow.DataType',
+ label=FieldDescriptorProto.LABEL_OPTIONAL)
+ message_proto.field.add(
+ name='value',
+ number=3,
+ type=FieldDescriptorProto.TYPE_MESSAGE,
+ type_name='.tensorflow.contrib.proto.TestValue',
+ label=FieldDescriptorProto.LABEL_OPTIONAL)
+
+ fn = os.path.join(self.get_temp_dir(), 'descriptor.pb')
+ with open(fn, 'wb') as f:
+ f.write(set_proto.SerializeToString())
+ return fn
+
+ def _testRoundtrip(self, descriptor_source):
+ # Numpy silently truncates the strings if you don't specify dtype=object.
+ in_bufs = np.array(
+ [test_base.ProtoOpTestBase.simple_test_case().SerializeToString()],
+ dtype=object)
+ message_type = 'tensorflow.contrib.proto.TestCase'
+ field_names = ['values', 'shapes', 'sizes', 'fields']
+ tensor_types = [dtypes.string, dtypes.int32, dtypes.int32, dtypes.string]
+
+ with self.test_session() as sess:
+ sizes, field_tensors = self._decode_module.decode_proto(
+ in_bufs,
+ message_type=message_type,
+ field_names=field_names,
+ output_types=tensor_types,
+ descriptor_source=descriptor_source)
+
+ out_tensors = self._encode_module.encode_proto(
+ sizes,
+ field_tensors,
+ message_type=message_type,
+ field_names=field_names,
+ descriptor_source=descriptor_source)
+
+ out_bufs, = sess.run([out_tensors])
+
+ # Check that the re-encoded tensor has the same shape.
+ self.assertEqual(in_bufs.shape, out_bufs.shape)
+
+ # Compare the input and output.
+ for in_buf, out_buf in zip(in_bufs.flat, out_bufs.flat):
+ # Check that the input and output serialized messages are identical.
+ # If we fail here, there is a difference in the serialized
+ # representation but the new serialization still parses. This could
+ # be harmless (a change in map ordering?) or it could be bad (e.g.
+ # loss of packing in the encoding).
+ self.assertEqual(in_buf, out_buf)
+
+ def testWithFileDescriptorSet(self):
+ # First try parsing with a local proto db, which should fail.
+ with self.assertRaisesOpError('No descriptor found for message type'):
+ self._testRoundtrip('local://')
+
+ # Now try parsing with a FileDescriptorSet which contains the test proto.
+ descriptor_file = self._createDescriptorFile()
+ self._testRoundtrip(descriptor_file)
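
A descriptor file like the one built above could presumably also be generated offline with the protobuf compiler, e.g. `protoc --include_imports --descriptor_set_out=descriptor.pb test_example.proto`; the test constructs the FileDescriptorSet programmatically instead so its bytes do not depend on the installed protoc version.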
diff --git a/tensorflow/contrib/rnn/python/ops/rnn.py b/tensorflow/contrib/rnn/python/ops/rnn.py
index 2f0caadda3..0266b72dcb 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn.py
@@ -175,7 +175,7 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
Returns:
A tuple (outputs, output_state_fw, output_state_bw) where:
outputs: Output `Tensor` shaped:
- `batch_size, max_time, layers_output]`. Where layers_output
+ `[batch_size, max_time, layers_output]`. Where layers_output
are depth-concatenated forward and backward outputs.
output_states_fw is the final states, one tensor per layer,
of the forward rnn.
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 26fd4e2023..fbb50befdf 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -93,3 +93,32 @@ py_test(
"//tensorflow/python/saved_model:utils",
],
)
+
+py_library(
+ name = "keras_saved_model",
+ srcs = ["python/saved_model/keras_saved_model.py"],
+ srcs_version = "PY2AND3",
+ tags = ["no_windows"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//tensorflow/python:lib",
+ "//tensorflow/python:util",
+ "//tensorflow/python/keras:engine",
+ "//tensorflow/python/saved_model:constants",
+ ],
+)
+
+py_test(
+ name = "keras_saved_model_test",
+ size = "small",
+ srcs = ["python/saved_model/keras_saved_model_test.py"],
+ srcs_version = "PY2AND3",
+ tags = ["no_windows"],
+ deps = [
+ ":saved_model_py",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:training",
+ "//tensorflow/python/keras",
+ "//third_party/py/numpy",
+ ],
+)
diff --git a/tensorflow/contrib/saved_model/__init__.py b/tensorflow/contrib/saved_model/__init__.py
index b4f27a055d..95e1a8967b 100644
--- a/tensorflow/contrib/saved_model/__init__.py
+++ b/tensorflow/contrib/saved_model/__init__.py
@@ -24,11 +24,12 @@ from __future__ import division
from __future__ import print_function
# pylint: disable=unused-import,wildcard-import,line-too-long
+from tensorflow.contrib.saved_model.python.saved_model.keras_saved_model import *
from tensorflow.contrib.saved_model.python.saved_model.signature_def_utils import *
# pylint: enable=unused-import,widcard-import,line-too-long
from tensorflow.python.util.all_util import remove_undocumented
-_allowed_symbols = ["get_signature_def_by_key"]
+_allowed_symbols = ["get_signature_def_by_key", "load_model", "save_model"]
remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/saved_model/python/saved_model/__init__.py b/tensorflow/contrib/saved_model/python/saved_model/__init__.py
index 7b91622b61..e3b76bb6f3 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/__init__.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/__init__.py
@@ -24,5 +24,6 @@ from __future__ import division
from __future__ import print_function
# pylint: disable=wildcard-import
+from tensorflow.contrib.saved_model.python.saved_model import keras_saved_model
from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
# pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
new file mode 100644
index 0000000000..e2a969f053
--- /dev/null
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utility functions to save/load keras Model to/from SavedModel."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.keras.models import model_from_json
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import constants
+from tensorflow.python.util import compat
+
+
+def save_model(model, saved_model_path):
+ """Save a `tf.keras.Model` into Tensorflow SavedModel format.
+
+ `save_model` generates the following files/folders under the
+ `saved_model_path` folder:
+ 1) an asset folder containing the JSON string of the model's
+ configuration (topology).
+ 2) a checkpoint containing the model weights.
+
+ Note that subclassed models cannot be saved via this function unless you
+ provide implementations of `get_config()` and `from_config()`.
+ Also note that `tf.keras.optimizers.Optimizer` instances cannot currently be
+ saved to checkpoints. Use optimizers from `tf.train` instead.
+
+ Args:
+ model: A `tf.keras.Model` to be saved.
+ saved_model_path: a string specifying the path to the SavedModel directory.
+
+ Raises:
+ NotImplementedError: If the passed in model is a subclassed model.
+ """
+ if not model._is_graph_network:
+ raise NotImplementedError
+
+ # save model configuration as a json string under assets folder.
+ model_json = model.to_json()
+ assets_destination_dir = os.path.join(
+ compat.as_bytes(saved_model_path),
+ compat.as_bytes(constants.ASSETS_DIRECTORY))
+
+ if not file_io.file_exists(assets_destination_dir):
+ file_io.recursive_create_dir(assets_destination_dir)
+
+ model_json_filepath = os.path.join(
+ compat.as_bytes(assets_destination_dir),
+ compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
+ file_io.write_string_to_file(model_json_filepath, model_json)
+
+ # save model weights in checkpoint format.
+ checkpoint_destination_dir = os.path.join(
+ compat.as_bytes(saved_model_path),
+ compat.as_bytes(constants.VARIABLES_DIRECTORY))
+
+ if not file_io.file_exists(checkpoint_destination_dir):
+ file_io.recursive_create_dir(checkpoint_destination_dir)
+
+ checkpoint_prefix = os.path.join(
+ compat.as_text(checkpoint_destination_dir),
+ compat.as_text(constants.VARIABLES_FILENAME))
+ model.save_weights(checkpoint_prefix, save_format='tf', overwrite=True)
+
+
+def load_model(saved_model_path):
+ """Load a keras.Model from SavedModel.
+
+ `load_model` reinstantiates model state by:
+ 1) loading the model topology from JSON (this will eventually come
+ from the metagraph).
+ 2) loading the model weights from the checkpoint.
+
+ Args:
+ saved_model_path: a string specifying the path to an existing SavedModel.
+
+ Returns:
+ a keras.Model instance.
+ """
+ # restore the model topology from the JSON string
+ model_json_filepath = os.path.join(
+ compat.as_bytes(saved_model_path),
+ compat.as_bytes(constants.ASSETS_DIRECTORY),
+ compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
+ model_json = file_io.read_file_to_string(model_json_filepath)
+ model = model_from_json(model_json)
+
+ # restore model weights
+ checkpoint_prefix = os.path.join(
+ compat.as_text(saved_model_path),
+ compat.as_text(constants.VARIABLES_DIRECTORY),
+ compat.as_text(constants.VARIABLES_FILENAME))
+ model.load_weights(checkpoint_prefix)
+ return model
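
A minimal usage sketch of the two functions above (paths are hypothetical; this mirrors what the tests below exercise):

    import numpy as np

    from tensorflow.contrib.saved_model.python.saved_model import keras_saved_model
    from tensorflow.python import keras

    model = keras.models.Sequential()
    model.add(keras.layers.Dense(2, input_shape=(3,)))

    # Writes the model JSON under assets/ and a checkpoint under variables/.
    keras_saved_model.save_model(model, '/tmp/keras_saved_model')

    # Rebuilds the topology from the JSON asset, then restores the weights.
    restored = keras_saved_model.load_model('/tmp/keras_saved_model')
    print(restored.predict(np.random.random((1, 3))))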
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
new file mode 100644
index 0000000000..107ae1b07b
--- /dev/null
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
@@ -0,0 +1,201 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Tests for saving/loading function for keras Model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import numpy as np
+
+from tensorflow.contrib.saved_model.python.saved_model import keras_saved_model
+from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training
+from tensorflow.python.platform import test
+from tensorflow.python.training import training as training_module
+
+
+class TestModelSavingAndLoading(test.TestCase):
+
+ def test_saving_sequential_model(self):
+ with self.test_session():
+ model = keras.models.Sequential()
+ model.add(keras.layers.Dense(2, input_shape=(3,)))
+ model.add(keras.layers.RepeatVector(3))
+ model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+ model.compile(
+ loss=keras.losses.MSE,
+ optimizer=keras.optimizers.RMSprop(lr=0.0001),
+ metrics=[keras.metrics.categorical_accuracy],
+ sample_weight_mode='temporal')
+ x = np.random.random((1, 3))
+ y = np.random.random((1, 3, 3))
+ model.train_on_batch(x, y)
+
+ ref_y = model.predict(x)
+ temp_dir = self.get_temp_dir()
+ self.addCleanup(shutil.rmtree, temp_dir)
+
+ temp_saved_model = os.path.join(temp_dir, 'saved_model')
+ keras_saved_model.save_model(model, temp_saved_model)
+
+ loaded_model = keras_saved_model.load_model(temp_saved_model)
+ y = loaded_model.predict(x)
+ self.assertAllClose(ref_y, y, atol=1e-05)
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_saving_sequential_model_without_compile(self):
+ with self.test_session():
+ model = keras.models.Sequential()
+ model.add(keras.layers.Dense(2, input_shape=(3,)))
+ model.add(keras.layers.RepeatVector(3))
+ model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+
+ x = np.random.random((1, 3))
+ ref_y = model.predict(x)
+
+ temp_dir = self.get_temp_dir()
+ self.addCleanup(shutil.rmtree, temp_dir)
+
+ temp_saved_model = os.path.join(temp_dir, 'saved_model')
+ keras_saved_model.save_model(model, temp_saved_model)
+ loaded_model = keras_saved_model.load_model(temp_saved_model)
+
+ y = loaded_model.predict(x)
+ self.assertAllClose(ref_y, y, atol=1e-05)
+
+ def test_saving_functional_model(self):
+ with self.test_session():
+ inputs = keras.layers.Input(shape=(3,))
+ x = keras.layers.Dense(2)(inputs)
+ output = keras.layers.Dense(3)(x)
+
+ model = keras.models.Model(inputs, output)
+ model.compile(
+ loss=keras.losses.MSE,
+ optimizer=keras.optimizers.RMSprop(lr=0.0001),
+ metrics=[keras.metrics.categorical_accuracy])
+ x = np.random.random((1, 3))
+ y = np.random.random((1, 3))
+ model.train_on_batch(x, y)
+
+ ref_y = model.predict(x)
+ temp_dir = self.get_temp_dir()
+ self.addCleanup(shutil.rmtree, temp_dir)
+
+ temp_saved_model = os.path.join(temp_dir, 'saved_model')
+ keras_saved_model.save_model(model, temp_saved_model)
+ loaded_model = keras_saved_model.load_model(temp_saved_model)
+
+ y = loaded_model.predict(x)
+ self.assertAllClose(ref_y, y, atol=1e-05)
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_saving_functional_model_without_compile(self):
+ with self.test_session():
+ inputs = keras.layers.Input(shape=(3,))
+ x = keras.layers.Dense(2)(inputs)
+ output = keras.layers.Dense(3)(x)
+
+ model = keras.models.Model(inputs, output)
+
+ x = np.random.random((1, 3))
+ y = np.random.random((1, 3))
+
+ ref_y = model.predict(x)
+ temp_dir = self.get_temp_dir()
+ self.addCleanup(shutil.rmtree, temp_dir)
+
+ temp_saved_model = os.path.join(temp_dir, 'saved_model')
+ keras_saved_model.save_model(model, temp_saved_model)
+ loaded_model = keras_saved_model.load_model(temp_saved_model)
+
+ y = loaded_model.predict(x)
+ self.assertAllClose(ref_y, y, atol=1e-05)
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_saving_with_tf_optimizer(self):
+ with self.test_session():
+ model = keras.models.Sequential()
+ model.add(keras.layers.Dense(2, input_shape=(3,)))
+ model.add(keras.layers.Dense(3))
+ model.compile(
+ loss='mse',
+ optimizer=training_module.RMSPropOptimizer(0.1),
+ metrics=['acc'])
+
+ x = np.random.random((1, 3))
+ y = np.random.random((1, 3))
+ model.train_on_batch(x, y)
+
+ ref_y = model.predict(x)
+ temp_dir = self.get_temp_dir()
+ self.addCleanup(shutil.rmtree, temp_dir)
+
+ temp_saved_model = os.path.join(temp_dir, 'saved_model')
+ keras_saved_model.save_model(model, temp_saved_model)
+ loaded_model = keras_saved_model.load_model(temp_saved_model)
+ loaded_model.compile(
+ loss='mse',
+ optimizer=training_module.RMSPropOptimizer(0.1),
+ metrics=['acc'])
+ y = loaded_model.predict(x)
+ self.assertAllClose(ref_y, y, atol=1e-05)
+
+ # test that new updates are the same with both models
+ x = np.random.random((1, 3))
+ y = np.random.random((1, 3))
+
+ ref_loss = model.train_on_batch(x, y)
+ loss = loaded_model.train_on_batch(x, y)
+ self.assertAllClose(ref_loss, loss, atol=1e-05)
+
+ ref_y = model.predict(x)
+ y = loaded_model.predict(x)
+ self.assertAllClose(ref_y, y, atol=1e-05)
+
+ # test saving/loading again
+ keras_saved_model.save_model(loaded_model, temp_saved_model)
+ loaded_model = keras_saved_model.load_model(temp_saved_model)
+ y = loaded_model.predict(x)
+ self.assertAllClose(ref_y, y, atol=1e-05)
+
+ def test_saving_subclassed_model_raise_error(self):
+ # For now, saving a subclassed model raises an error. This restriction
+ # should be lifted once loading from SavedModel.pb is supported.
+
+ class SubclassedModel(training.Model):
+
+ def __init__(self):
+ super(SubclassedModel, self).__init__()
+ self.layer1 = keras.layers.Dense(3)
+ self.layer2 = keras.layers.Dense(1)
+
+ def call(self, inp):
+ return self.layer2(self.layer1(inp))
+
+ model = SubclassedModel()
+ temp_dir = self.get_temp_dir()
+ self.addCleanup(shutil.rmtree, temp_dir)
+ temp_saved_model = os.path.join(temp_dir, 'saved_model')
+ with self.assertRaises(NotImplementedError):
+ keras_saved_model.save_model(model, temp_saved_model)
+
+
+if __name__ == '__main__':
+ test.main()
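All of the passing cases above follow one round-trip pattern: build (and optionally compile and train) a model, export it with keras_saved_model.save_model, reload it with load_model, and assert that predictions agree. A minimal standalone sketch of that pattern — the keras_saved_model import path is an assumption, matching what these tests appear to import:

import os
import shutil
import tempfile

import numpy as np

from tensorflow.contrib.saved_model.python.saved_model import keras_saved_model  # assumed import path
from tensorflow.python import keras

model = keras.models.Sequential([keras.layers.Dense(3, input_shape=(3,))])
model.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=1e-4))

x = np.random.random((1, 3))
ref_y = model.predict(x)

export_dir = os.path.join(tempfile.mkdtemp(), 'saved_model')
keras_saved_model.save_model(model, export_dir)      # export as a SavedModel
restored = keras_saved_model.load_model(export_dir)  # rebuild the Keras model

np.testing.assert_allclose(ref_y, restored.predict(x), atol=1e-5)
shutil.rmtree(os.path.dirname(export_dir))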
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
index 345eb6cfaa..f4348e80ea 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
@@ -53,7 +53,8 @@ def spectrogram_to_mel_matrix(num_mel_bins=20,
num_spectrogram_bins=129,
audio_sample_rate=8000,
lower_edge_hertz=125.0,
- upper_edge_hertz=3800.0):
+ upper_edge_hertz=3800.0,
+ unused_dtype=None):
"""Return a matrix that can post-multiply spectrogram rows to make mel.
Copied from
@@ -132,9 +133,9 @@ class LinearToMelTest(test.TestCase):
# lower_edge_hertz, upper_edge_hertz) to test.
configs = [
# Defaults.
- (20, 129, 8000.0, 125.0, 3800.0),
+ (20, 129, 8000.0, 125.0, 3800.0, dtypes.float64),
# Settings used by Tacotron (https://arxiv.org/abs/1703.10135).
- (80, 1025, 24000.0, 80.0, 12000.0)
+ (80, 1025, 24000.0, 80.0, 12000.0, dtypes.float64)
]
with self.test_session(use_gpu=True):
for config in configs:
@@ -143,7 +144,8 @@ class LinearToMelTest(test.TestCase):
self.assertAllClose(mel_matrix_np, mel_matrix.eval(), atol=3e-6)
def test_dtypes(self):
- for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+ # LinSpace is not supported for tf.float16.
+ for dtype in (dtypes.bfloat16, dtypes.float32, dtypes.float64):
self.assertEqual(dtype,
mel_ops.linear_to_mel_weight_matrix(dtype=dtype).dtype)
@@ -167,7 +169,8 @@ class LinearToMelTest(test.TestCase):
def test_constant_folding(self):
"""Mel functions should be constant foldable."""
- for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+ # TODO(rjryan): tf.bfloat16 cannot be constant folded by Grappler.
+ for dtype in (dtypes.float32, dtypes.float64):
g = ops.Graph()
with g.as_default():
mel_matrix = mel_ops.linear_to_mel_weight_matrix(dtype=dtype)
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py
index 1e84006116..062d84aea1 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mel_ops.py
@@ -151,22 +151,21 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
_validate_arguments(num_mel_bins, sample_rate,
lower_edge_hertz, upper_edge_hertz, dtype)
- # To preserve accuracy, we compute the matrix at float64 precision and then
- # cast to `dtype` at the end. This function can be constant folded by graph
- # optimization since there are no Tensor inputs.
+ # This function can be constant folded by graph optimization since there are
+ # no Tensor inputs.
sample_rate = ops.convert_to_tensor(
- sample_rate, dtypes.float64, name='sample_rate')
+ sample_rate, dtype, name='sample_rate')
lower_edge_hertz = ops.convert_to_tensor(
- lower_edge_hertz, dtypes.float64, name='lower_edge_hertz')
+ lower_edge_hertz, dtype, name='lower_edge_hertz')
upper_edge_hertz = ops.convert_to_tensor(
- upper_edge_hertz, dtypes.float64, name='upper_edge_hertz')
- zero_float64 = ops.convert_to_tensor(0.0, dtypes.float64)
+ upper_edge_hertz, dtype, name='upper_edge_hertz')
+ zero = ops.convert_to_tensor(0.0, dtype)
# HTK excludes the spectrogram DC bin.
bands_to_zero = 1
nyquist_hertz = sample_rate / 2.0
linear_frequencies = math_ops.linspace(
- zero_float64, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:]
+ zero, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:]
spectrogram_bins_mel = array_ops.expand_dims(
_hertz_to_mel(linear_frequencies), 1)
@@ -193,11 +192,8 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
# Intersect the line segments with each other and zero.
mel_weights_matrix = math_ops.maximum(
- zero_float64, math_ops.minimum(lower_slopes, upper_slopes))
+ zero, math_ops.minimum(lower_slopes, upper_slopes))
# Re-add the zeroed lower bins we sliced out above.
- mel_weights_matrix = array_ops.pad(
- mel_weights_matrix, [[bands_to_zero, 0], [0, 0]])
-
- # Cast to the desired type.
- return math_ops.cast(mel_weights_matrix, dtype, name=name)
+ return array_ops.pad(
+ mel_weights_matrix, [[bands_to_zero, 0], [0, 0]], name=name)
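The change above trades a little float64 accuracy for constant-foldability: the matrix is now built directly in the requested dtype, with no trailing cast, so Grappler can fold the whole computation away for float32/float64 graphs. A usage sketch with illustrative values only:

import tensorflow as tf

# [frames, 129] magnitude spectrogram; random here just for illustration.
spectrogram = tf.abs(tf.random_normal([100, 129]))

mel_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
    num_mel_bins=20,
    num_spectrogram_bins=129,
    sample_rate=8000,
    lower_edge_hertz=125.0,
    upper_edge_hertz=3800.0,
    dtype=tf.float32)  # computed end to end in float32 after this change

# Post-multiply spectrogram rows to obtain a [frames, 20] mel spectrogram.
mel_spectrogram = tf.matmul(spectrogram, mel_matrix)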
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index 3e41e3d0b4..4d1807130c 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -20,6 +20,8 @@ import os
import tempfile
import time
+import sqlite3
+
import numpy as np
import six
@@ -275,6 +277,22 @@ class EagerFileTest(test_util.TensorFlowTestCase):
class EagerDbTest(summary_test_util.SummaryDbTest):
+ def testDbURIOpen(self):
+ tmpdb_path = os.path.join(self.get_temp_dir(), 'tmpDbURITest.sqlite')
+ tmpdb_uri = six.moves.urllib_parse.urljoin("file:", tmpdb_path)
+ tmpdb_writer = summary_ops.create_db_writer(
+ tmpdb_uri,
+ "experimentA",
+ "run1",
+ "user1")
+ with summary_ops.always_record_summaries():
+ with tmpdb_writer.as_default():
+ summary_ops.scalar('t1', 2.0)
+ tmpdb = sqlite3.connect(tmpdb_path)
+ num = get_one(tmpdb, 'SELECT count(*) FROM Tags WHERE tag_name = "t1"')
+ self.assertEqual(num, 1)
+ tmpdb.close()
+
def testIntegerSummaries(self):
step = training_util.create_global_step()
writer = self.create_db_writer()
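The new testDbURIOpen covers passing a file: URI, rather than a bare path, to create_db_writer. A condensed sketch of the same flow, mirroring the test (eager execution assumed, as in EagerDbTest):

import os
import sqlite3
import tempfile

import six
from tensorflow.contrib import summary as summary_ops  # assumed alias

db_path = os.path.join(tempfile.mkdtemp(), 'events.sqlite')
# urljoin('file:', '/tmp/x.sqlite') -> 'file:///tmp/x.sqlite' for POSIX paths.
db_uri = six.moves.urllib_parse.urljoin('file:', db_path)

writer = summary_ops.create_db_writer(db_uri, 'experimentA', 'run1', 'user1')
with summary_ops.always_record_summaries(), writer.as_default():
  summary_ops.scalar('t1', 2.0)

# The writer resolves the URI to the underlying file, so sqlite3 can read it.
with sqlite3.connect(db_path) as db:
  count = db.execute(
      "SELECT count(*) FROM Tags WHERE tag_name = 't1'").fetchone()[0]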
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index a9378e9ad6..46f3c36e3d 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,7 +11,6 @@ exports_files(["LICENSE"])
load(
"//tensorflow:tensorflow.bzl",
- "cuda_py_test",
"tf_cc_test",
"tf_copts",
"tf_cuda_library",
@@ -20,6 +19,7 @@ load(
"tf_gen_op_libs",
"tf_gen_op_wrapper_py",
)
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
@@ -32,9 +32,14 @@ tf_cuda_cc_test(
name = "tensorrt_test_cc",
size = "small",
srcs = ["tensorrt_test.cc"],
- tags = ["no_windows"],
+ tags = [
+ "no_windows",
+ "nomac",
+ ],
deps = [
+ "//tensorflow/core:gpu_init",
"//tensorflow/core:lib",
+ "//tensorflow/core:stream_executor",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
] + if_tensorrt([
@@ -117,7 +122,6 @@ tf_cuda_library(
tf_gen_op_wrapper_py(
name = "trt_engine_op",
- gen_locally = True,
deps = [
":trt_engine_op_op_lib",
":trt_logging",
@@ -154,6 +158,7 @@ py_library(
],
srcs_version = "PY2AND3",
deps = [
+ ":tf_trt_integration_test_base",
":trt_convert_py",
":trt_ops_py",
"//tensorflow/python:errors",
@@ -233,7 +238,10 @@ tf_cc_test(
name = "trt_allocator_test",
size = "small",
srcs = ["resources/trt_allocator_test.cc"],
- tags = ["no_windows"],
+ tags = [
+ "no_windows",
+ "nomac",
+ ],
deps = [
":trt_allocator",
"//tensorflow/core:test",
@@ -301,14 +309,21 @@ tf_cc_test(
name = "segment_test",
size = "small",
srcs = ["segment/segment_test.cc"],
- tags = ["no_windows"],
+ tags = [
+ "no_windows",
+ "nomac",
+ ],
deps = [
":segment",
- "//tensorflow/c:c_api",
+ "//tensorflow/cc:cc_ops",
+ "//tensorflow/cc:scope",
+ "//tensorflow/core:core_cpu",
"//tensorflow/core:lib",
+ "//tensorflow/core:ops",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
],
)
@@ -337,7 +352,11 @@ tf_cuda_cc_test(
name = "trt_plugin_factory_test",
size = "small",
srcs = ["plugin/trt_plugin_factory_test.cc"],
- tags = ["no_windows"],
+ tags = [
+ "no_cuda_on_cpu_tap",
+ "no_windows",
+ "nomac",
+ ],
deps = [
":trt_plugins",
"//tensorflow/core:lib",
@@ -349,16 +368,40 @@ tf_cuda_cc_test(
]),
)
-cuda_py_test(
+py_library(
+ name = "tf_trt_integration_test_base",
+ srcs = ["test/tf_trt_integration_test_base.py"],
+ deps = [
+ ":trt_convert_py",
+ ":trt_ops_py",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:framework_test_lib",
+ ],
+)
+
+cuda_py_tests(
name = "tf_trt_integration_test",
- srcs = ["test/tf_trt_integration_test.py"],
+ srcs = [
+ "test/base_test.py",
+ # "test/batch_matmul_test.py",
+ # "test/biasadd_matmul_test.py",
+ # "test/binary_tensor_weight_broadcast_test.py", # Blocked by trt4 installation
+ # "test/concatenation_test.py", # Blocked by trt4 installation
+ "test/const_broadcast_test.py",
+ "test/multi_connection_neighbor_engine_test.py",
+ "test/neighboring_engine_test.py",
+ # "test/unary_test.py", # Blocked by trt4 installation
+ # "test/vgg_block_nchw_test.py",
+ # "test/vgg_block_test.py",
+ "test/memory_alignment_test.py",
+ ],
additional_deps = [
- ":init_py",
+ ":tf_trt_integration_test_base",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework_test_lib",
],
- main = "test/tf_trt_integration_test.py",
tags = [
+ "no_cuda_on_cpu_tap",
"no_windows",
"nomac",
],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 68c78e8301..3383f6bc9b 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -301,7 +301,8 @@ tensorflow::Status GetEngineInfo(
const int node_id = node->id();
for (const auto edge : node->in_edges()) {
auto input_node = edge->src();
- if (segment_nodes.count(input_node->name()) == 0) {
+ if (segment_nodes.count(input_node->name()) == 0 &&
+ !edge->IsControlEdge() && !input_node->IsSource()) {
// Add constant input node into the segment. We don't care if it has
// other output edges going into other engines or TF nodes. Since we add
// it only to the subsegment node list, not the subsegment itself, it
@@ -312,7 +313,7 @@ tensorflow::Status GetEngineInfo(
added_const_node_ids.insert(input_node->id());
subgraph_node_ids.push_back(input_node->id());
}
- } else if (!edge->IsControlEdge() && !input_node->IsSource()) {
+ } else {
string s(input_node->name());
StrAppend(&s, ":", edge->src_output());
VLOG(1) << "Input edge = " << s;
@@ -378,9 +379,9 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
nvinfer1::IGpuAllocator* alloc,
int max_batch_size) {
const auto& info = infos.at(pos);
- std::vector<tensorflow::TensorShapeProto> out_shapes;
- std::vector<tensorflow::TensorShapeProto> input_shapes;
- std::vector<tensorflow::PartialTensorShape> shapes;
+ std::vector<tensorflow::TensorShapeProto> output_shape_protos;
+ std::vector<tensorflow::TensorShapeProto> input_shape_protos;
+ std::vector<tensorflow::PartialTensorShape> input_shapes;
std::vector<tensorflow::NodeDefBuilder::NodeOut> inputs;
std::vector<tensorflow::DataType> out_types;
VLOG(1) << "Processing " << info.engine_name;
@@ -393,11 +394,11 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
tensorflow::TensorShapeProto out_shape;
// shape of the output node inside segment
conn.inside_shape.AsProto(&out_shape);
- if (out_shapes.size() <= conn.port_number) {
- out_shapes.resize(conn.port_number + 1);
+ if (output_shape_protos.size() <= conn.port_number) {
+ output_shape_protos.resize(conn.port_number + 1);
out_types.resize(conn.port_number + 1);
}
- out_shapes.at(conn.port_number) = out_shape;
+ output_shape_protos.at(conn.port_number) = out_shape;
out_types.at(conn.port_number) = conn.connection_type;
continue;
}
@@ -405,12 +406,12 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
// Set the shapes and data types of input edge.
tensorflow::TensorShapeProto in_shape;
conn.outside_shape.AsProto(&in_shape);
- if (input_shapes.size() <= conn.port_number) {
+ if (input_shape_protos.size() <= conn.port_number) {
+ input_shape_protos.resize(conn.port_number + 1);
input_shapes.resize(conn.port_number + 1);
- shapes.resize(conn.port_number + 1);
}
- input_shapes.at(conn.port_number) = in_shape;
- shapes.at(conn.port_number) = conn.outside_shape;
+ input_shape_protos.at(conn.port_number) = in_shape;
+ input_shapes.at(conn.port_number) = conn.outside_shape;
string input_node = conn.outside_node_name;
int input_port = conn.outside_port;
@@ -438,6 +439,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
VLOG(1) << "Engine Input " << input_node << ":" << input_port << " -> "
<< info.engine_name << ":" << inputs.size();
// Skip duplicate inputs.
+ // TODO(aaroey): use std::find instead. GetEngineInfo already removes
+ // duplicate connections, so we should never find any duplicate here?
bool new_input = true;
for (const auto& inp : inputs) {
if (inp.node == input_node && inp.index == input_port) {
@@ -465,8 +468,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
info.segment_graph_def,
info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode,
- max_batch_size, info.max_workspace_size_bytes, shapes, &trt_logger,
- alloc, /*calibrator=*/nullptr, &engine,
+ max_batch_size, info.max_workspace_size_bytes, input_shapes,
+ &trt_logger, alloc, /*calibrator=*/nullptr, &engine,
/*convert_successfully=*/nullptr));
TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
segment_string =
@@ -514,8 +517,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
}
tensorflow::NodeDef trt_node;
tensorflow::Status status =
- node_builder.Attr("input_shapes", input_shapes)
- .Attr("output_shapes", out_shapes)
+ node_builder.Attr("input_shapes", input_shape_protos)
+ .Attr("output_shapes", output_shape_protos)
.Attr("static_engine",
info.engine_type == EngineInfo::EngineType::TRTStatic)
.Attr("segment_funcdef_name",
@@ -734,6 +737,7 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
}
// Entry function from optimization pass.
+// TODO(aaroey): the parameter should use a pointer type.
tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
// Convert graphdef to graph.
tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
@@ -751,7 +755,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
segment_options.minimum_segment_size = params.minimum_segment_size;
tensorflow::tensorrt::segment::SegmentNodesVector initial_segments;
TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
- &graph, IsTensorRTCandidate, segment_options, &initial_segments));
+ &graph, IsTensorRTCandidate, InputEdgeValidator(*params.graph_properties),
+ OutputEdgeValidator(), segment_options, &initial_segments));
if (initial_segments.size() > 1) {
VLOG(0) << "MULTIPLE tensorrt candidate conversion: "
<< initial_segments.size();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 49e825151a..451d6fe698 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
#include <algorithm>
+#include <cstring>
#include <list>
#include <map>
#include <memory>
@@ -77,7 +78,6 @@ namespace tensorflow {
namespace tensorrt {
namespace convert {
using ::tensorflow::str_util::Split;
-
using ::tensorflow::strings::StrAppend;
using ::tensorflow::strings::StrCat;
@@ -107,6 +107,59 @@ inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
return tensorflow::Status::OK();
}
+void GetInputProperties(const grappler::GraphProperties& graph_properties,
+ const Node* outside_node, const int out_port,
+ PartialTensorShape* shape,
+ tensorflow::DataType* dtype) {
+ if (graph_properties.HasOutputProperties(outside_node->name())) {
+ auto output_params =
+ graph_properties.GetOutputProperties(outside_node->name());
+ auto out_shape = output_params.at(out_port);
+ *dtype = out_shape.dtype();
+ *shape = out_shape.shape();
+ } else {
+ VLOG(0) << "Unknown output shape" << outside_node->name();
+ *dtype = outside_node->output_type(out_port);
+ }
+}
+
+void GetOutputProperties(const grappler::GraphProperties& graph_properties,
+ const Node* outside_node, const int in_port,
+ PartialTensorShape* shape,
+ tensorflow::DataType* dtype) {
+ if (graph_properties.HasInputProperties(outside_node->name())) {
+ auto input_params =
+ graph_properties.GetInputProperties(outside_node->name());
+ auto in_shape = input_params.at(in_port);
+ *dtype = in_shape.dtype();
+ *shape = in_shape.shape();
+ } else {
+ *dtype = outside_node->input_type(in_port);
+ }
+}
+
+tensorflow::Status ValidateInputProperties(const PartialTensorShape& shape,
+ const tensorflow::DataType dtype,
+ nvinfer1::DataType* trt_dtype) {
+ // TODO(aaroey): some of these checks also apply to IsTensorRTCandidate(), so
+ // put them there instead.
+ TF_RETURN_IF_ERROR(ConvertDType(dtype, trt_dtype));
+ if (shape.dims() < 0) {
+ return tensorflow::errors::InvalidArgument("Input tensor rank is unknown.");
+ }
+ if (shape.dims() > 9) {
+ return tensorflow::errors::OutOfRange(
+ "Input tensor rank is greater than 8.");
+ }
+ for (int d = 1; d < shape.dims(); ++d) {
+ if (shape.dim_size(d) < 0) {
+ return tensorflow::errors::InvalidArgument(
+ "Input tensor has a unknown non-batch dimemension at dim ", d);
+ }
+ }
+ return Status::OK();
+}
+
// Return whether or not the broadcast is feasible;
bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l,
const bool operand_l_is_tensor,
@@ -2642,25 +2695,22 @@ tensorflow::Status ConvertGraphDefToEngine(
(node_def.op() == "Placeholder")) {
nvinfer1::DimsCHW input_dim_pseudo_chw;
for (int i = 0; i < 8; i++) input_dim_pseudo_chw.d[i] = 0;
- nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
- auto type_status =
- ConvertDType(node_def.attr().at("dtype").type(), &dtype);
- if (type_status != tensorflow::Status::OK()) {
- LOG(WARNING) << "Type conversion failed for " << node_name;
- return type_status;
- }
int32 slot_number = -1;
- if (!tensorflow::strings::safe_strto32(node_name.c_str() + 8,
- &slot_number)) {
- LOG(ERROR) << "Failed to parse slot number from " << node_name
- << " +8= " << node_name.c_str() + 8;
+ if (!tensorflow::strings::safe_strto32(
+ node_name.c_str() + strlen(kInputPHName), &slot_number)) {
+ return tensorflow::errors::InvalidArgument(
+ "Failed to parse slot number from ", node_name);
}
+ nvinfer1::DataType dtype;
auto shape = input_shapes.at(slot_number);
- if (shape.dims() > 8) {
- LOG(ERROR) << "Tensor rank is greater than 8 for " << node_name
- << " at input slot " << slot_number;
- return tensorflow::errors::OutOfRange(
- "Input tensor rank is greater than 8");
+ auto status = ValidateInputProperties(
+ shape, node_def.attr().at("dtype").type(), &dtype);
+ if (!status.ok()) {
+ const string error_message =
+ StrCat("Validation failed for ", node_name, " and input slot ",
+ slot_number, ": ", status.error_message());
+ LOG(WARNING) << error_message;
+ return Status(status.code(), error_message);
}
if (VLOG_IS_ON(1)) {
string dim_str("dims=");
@@ -2691,10 +2741,10 @@ tensorflow::Status ConvertGraphDefToEngine(
} else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) &&
(node_def.op() == "Identity")) {
int32 slot_number = -1;
- if (!tensorflow::strings::safe_strto32(node_name.c_str() + 9,
- &slot_number)) {
- LOG(ERROR) << "Failed to parse slot number from " << node_name
- << " +9=" << node_name.c_str() + 9;
+ if (!tensorflow::strings::safe_strto32(
+ node_name.c_str() + strlen(kOutputPHName), &slot_number)) {
+ return tensorflow::errors::InvalidArgument(
+ "Failed to parse slot number from ", node_name);
}
if (output_tensors.size() <= slot_number) {
output_tensors.resize(slot_number + 1);
@@ -2753,38 +2803,20 @@ tensorflow::Status ConvertSegmentToGraphDef(
"Cannot find node with id ", connection.outside_id, " in the graph.");
}
// Updates the shape and data types of input/output connections.
- tensorflow::DataType input_type = tensorflow::DT_FLOAT;
+ tensorflow::DataType dtype;
tensorflow::PartialTensorShape partial_shape;
if (connection.is_input_edge) {
- if (graph_properties.HasOutputProperties(connection.outside_node_name)) {
- auto output_params =
- graph_properties.GetOutputProperties(connection.outside_node_name);
- auto out_shape = output_params.at(connection.outside_port);
- input_type = out_shape.dtype();
- std::vector<tensorflow::int64> dims;
- partial_shape = out_shape.shape();
- connection.outside_shape = partial_shape;
- } else {
- VLOG(0) << "Unknown output shape" << outside_node->name();
- input_type = graph->FindNodeId(connection.outside_id)
- ->output_type(connection.outside_port);
- }
- connection.connection_type = input_type;
-
- } else { // output edge
- if (graph_properties.HasInputProperties(connection.outside_node_name)) {
- auto input_params =
- graph_properties.GetInputProperties(connection.outside_node_name);
- auto in_shape = input_params.at(connection.outside_port);
- input_type = in_shape.dtype();
- partial_shape = in_shape.shape();
- connection.inside_shape = partial_shape;
- } else {
- input_type = graph->FindNodeId(connection.inside_id)
- ->output_type(connection.outside_port);
- }
- connection.connection_type = input_type;
+ GetInputProperties(graph_properties,
+ graph->FindNodeId(connection.outside_id),
+ connection.outside_port, &partial_shape, &dtype);
+
+ } else {
+ GetOutputProperties(graph_properties,
+ graph->FindNodeId(connection.outside_id),
+ connection.outside_port, &partial_shape, &dtype);
}
+ connection.outside_shape = partial_shape;
+ connection.connection_type = dtype;
// Add dummy input/output nodes to the segment graphdef.
if (connection.is_input_edge) {
@@ -2800,7 +2832,7 @@ tensorflow::Status ConvertSegmentToGraphDef(
auto seg_node = segment_def->add_node();
tensorflow::NodeDefBuilder builder(node_name, "Placeholder");
auto status = builder.Attr("shape", partial_shape)
- .Attr("dtype", input_type)
+ .Attr("dtype", dtype)
.Finalize(seg_node);
VLOG(1) << "Constructing input " << node_name << " for the edge "
<< connection.outside_node_name << ":" << connection.outside_port
@@ -2818,7 +2850,7 @@ tensorflow::Status ConvertSegmentToGraphDef(
marker_nodes.insert(node_name);
auto seg_node = segment_def->add_node();
tensorflow::NodeDefBuilder builder(node_name, "Identity");
- auto status = builder.Input(connection.inside_node_name, 0, input_type)
+ auto status = builder.Input(connection.inside_node_name, 0, dtype)
.Finalize(seg_node);
VLOG(1) << "Constructing output " << node_name << " for the edge "
<< connection.inside_node_name << ":" << connection.inside_port
@@ -2856,6 +2888,38 @@ tensorflow::Status ConvertSegmentToGraphDef(
return tensorflow::Status::OK();
}
+bool InputEdgeValidator::operator()(const tensorflow::Edge* in_edge) const {
+ if (in_edge->IsControlEdge()) return true;
+ PartialTensorShape shape;
+ tensorflow::DataType dtype;
+ GetInputProperties(graph_properties_, in_edge->src(), in_edge->src_output(),
+ &shape, &dtype);
+ nvinfer1::DataType trt_dtype;
+ Status status = ValidateInputProperties(shape, dtype, &trt_dtype);
+ if (!status.ok()) {
+ VLOG(2) << "--> Need to remove input node " << in_edge->dst()->name()
+ << ": " << status;
+ return false;
+ }
+ if (shape.dims() < 3 && in_edge->src()->type_string() != "Const") {
+ VLOG(2) << "--> Need to remove input node " << in_edge->dst()->name()
+ << " which has an input at port " << in_edge->dst_input()
+ << " with #dim<3 and is not a const: " << shape;
+ return false;
+ }
+ return true;
+}
+
+bool OutputEdgeValidator::operator()(const tensorflow::Edge* out_edge) const {
+ if (out_edge->IsControlEdge()) return true;
+ if (out_edge->src()->type_string() == "Const") {
+ VLOG(2) << "--> Need to remove output node " << out_edge->src()->name()
+ << " which is a Const.";
+ return false;
+ }
+ return true;
+}
+
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
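ValidateInputProperties above boils down to three shape rules: the rank must be known, the rank including the batch dimension must not exceed 9 (i.e. at most 8 non-batch dimensions), and only the batch dimension may be unknown. The same checks in a standalone Python sketch, not the real API:

def validate_input_shape(dims):
  # dims: list of sizes with -1 for unknown; dims[0] is the batch dimension.
  if dims is None:
    raise ValueError('Input tensor rank is unknown.')
  if len(dims) > 9:  # batch dimension plus at most 8 non-batch dimensions
    raise ValueError('Input tensor rank (excluding batch) is greater than 8.')
  for d, size in enumerate(dims[1:], start=1):
    if size < 0:
      raise ValueError('Unknown non-batch dimension at dim %d' % d)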
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 81baf8e7c1..6ae60ec352 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -105,6 +105,8 @@ struct EngineInfo {
// topological order.
// - segment_def: the output GraphDef, whose non-input/output nodedefs will be
// sorted in topological order.
+//
+// TODO(aaroey): add tests to validate these properties.
tensorflow::Status ConvertSegmentToGraphDef(
const tensorflow::Graph* graph,
const tensorflow::grappler::GraphProperties& graph_properties,
@@ -129,6 +131,30 @@ tensorflow::Status ConvertGraphDefToEngine(
TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
bool* convert_successfully);
+// Helper class for the segmenter to determine whether an input edge to the TRT
+// segment is valid.
+class InputEdgeValidator {
+ public:
+ InputEdgeValidator(const grappler::GraphProperties& graph_properties)
+ : graph_properties_(graph_properties) {}
+
+ // Return true if the specified edge is eligible to be an input edge of the
+ // TRT segment.
+ bool operator()(const tensorflow::Edge* in_edge) const;
+
+ private:
+ const grappler::GraphProperties& graph_properties_;
+};
+
+// Helper class for the segmenter to determine whether an output edge from the
+// TRT segment is valid.
+class OutputEdgeValidator {
+ public:
+ // Return true if the specified edge is eligible to be an output edge of the
+ // TRT segment.
+ bool operator()(const tensorflow::Edge* out_edge) const;
+};
+
} // namespace convert
} // namespace tensorrt
} // namespace tensorflow
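Functionally the two validator classes are just predicates over edges, and the segmenter (see segment.cc below) calls them to trim ineligible inputs and outputs from each segment. Their semantics, sketched in Python with hypothetical edge accessors and the validate_input_shape sketch from above:

def input_edge_is_valid(edge):
  # Mirrors InputEdgeValidator::operator(): control edges always pass; the
  # input shape must validate; tensors of rank < 3 are only accepted when
  # produced by a Const node.
  if edge.is_control:
    return True
  shape = edge.src_output_shape  # hypothetical accessor
  try:
    validate_input_shape(shape)
  except ValueError:
    return False
  return len(shape) >= 3 or edge.src_type == 'Const'

def output_edge_is_valid(edge):
  # Mirrors OutputEdgeValidator::operator(): a Const node must not be a
  # segment output.
  return edge.is_control or edge.src_type != 'Const'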
diff --git a/tensorflow/contrib/tensorrt/convert/utils.cc b/tensorflow/contrib/tensorrt/convert/utils.cc
index 24591cf84b..17857cf4d0 100644
--- a/tensorflow/contrib/tensorrt/convert/utils.cc
+++ b/tensorflow/contrib/tensorrt/convert/utils.cc
@@ -24,7 +24,7 @@ bool IsGoogleTensorRTEnabled() {
// safely write code that uses tensorrt conditionally. E.g. if it does not
// check for tensorrt, and a user mistakenly uses tensorrt, they will just
// crash and burn.
-#ifdef GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
return true;
#else
return false;
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 646d62483f..6699b71d28 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -45,11 +45,11 @@ using ::tensorflow::strings::StrCat;
// Helps simultaneous execution of native and TRT engines.
class AsyncHelper : public tensorflow::core::RefCounted {
public:
- AsyncHelper(tensorflow::AsyncOpKernel::DoneCallback done) { done_ = done; }
+ AsyncHelper(AsyncOpKernel::DoneCallback done) { done_ = done; }
~AsyncHelper() override { done_(); }
private:
- tensorflow::AsyncOpKernel::DoneCallback done_;
+ AsyncOpKernel::DoneCallback done_;
};
#define TYPECASE(dt, X, Y) \
@@ -152,7 +152,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
}
}
-void TRTEngineOp::ExecuteNativeSegment(tensorflow::OpKernelContext* ctx,
+void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
AsyncHelper* helper) {
if (!calibration_mode_) {
VLOG(1) << "Executing native engine";
@@ -193,7 +193,7 @@ void TRTEngineOp::ExecuteNativeSegment(tensorflow::OpKernelContext* ctx,
});
}
-void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx,
+void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
AsyncHelper* helper) {
helper->Ref();
tensorflow::core::ScopedUnref sc(helper);
@@ -238,7 +238,7 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx,
ExecuteNativeSegment(ctx, helper);
}
-int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) {
+int TRTEngineOp::GetEngineBatch(OpKernelContext* ctx) {
int num_batch = ctx->input(0).shape().dim_size(0);
int smallest_engine = 0;
for (const auto i : cached_engine_batches_) {
@@ -254,21 +254,20 @@ int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) {
cached_engine_batches_.push_back(num_batch);
VLOG(1) << "Running with batch size " << num_batch;
} else {
- string s("Engine buffer is full. buffer limit= ");
- StrAppend(&s, max_cached_engines_, ", current entries= ");
- for (auto i : cached_engine_batches_) StrAppend(&s, i, ", ");
- StrAppend(&s, "Requested batch= ", num_batch);
- LOG(ERROR) << s;
- ctx->SetStatus(tensorflow::errors::ResourceExhausted(
- "Requested batch size is not available and engine cache is full"));
+ string msg =
+ StrCat("Engine buffer is full. buffer limit=", max_cached_engines_,
+ ", current entries=");
+ for (auto i : cached_engine_batches_) StrAppend(&msg, i, ",");
+ StrAppend(&msg, "Requested batch=", num_batch);
+ LOG(WARNING) << msg;
return -1;
}
}
return smallest_engine;
}
-void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
- tensorflow::AsyncOpKernel::DoneCallback done) {
+void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
+ AsyncOpKernel::DoneCallback done) {
auto helper = new AsyncHelper(done);
tensorflow::core::ScopedUnref sc(helper);
if (calibration_mode_) {
@@ -276,32 +275,52 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
return;
}
const int smallest_engine = GetEngineBatch(ctx);
- if (smallest_engine < 0) return; // GetEngineBatch already set the status.
+ if (smallest_engine < 0) {
+ LOG(WARNING) << "Failed to get engine batch, running native segment";
+ ExecuteNativeSegment(ctx, helper);
+ return;
+ }
const int num_batch = ctx->input(0).shape().dim_size(0);
auto& engine_ctx_pair = GetEngine(smallest_engine, ctx);
auto& trt_engine_ptr = engine_ctx_pair.first;
if (!trt_engine_ptr) {
LOG(WARNING) << "Engine retrieval for batch size " << num_batch
- << " failed Running native segment";
+ << " failed. Running native segment";
+ ExecuteNativeSegment(ctx, helper);
+ return;
+ }
+ const bool retry = ExecuteTrtEngine(ctx, num_batch, trt_engine_ptr.get(),
+ engine_ctx_pair.second.get());
+ if (retry) {
+ LOG(WARNING) << "Failed to execute engine, retrying with native segment";
ExecuteNativeSegment(ctx, helper);
return;
}
+}
+bool TRTEngineOp::ExecuteTrtEngine(
+ OpKernelContext* ctx, const int num_batch,
+ nvinfer1::ICudaEngine* trt_engine_ptr,
+ nvinfer1::IExecutionContext* trt_execution_context_ptr) {
+ const bool kRetry = true;
const int num_binding = ctx->num_inputs() + ctx->num_outputs();
std::vector<void*> buffers(num_binding);
for (int i = 0; i < ctx->num_inputs(); i++) {
- const string inp_name = StrCat(kInputPHName, i);
+ const string input_name = StrCat(kInputPHName, i);
const size_t binding_index =
- trt_engine_ptr->getBindingIndex(inp_name.c_str());
+ trt_engine_ptr->getBindingIndex(input_name.c_str());
+ if (binding_index == -1) {
+ LOG(ERROR) << "Input node not found, at " << input_name;
+ return kRetry;
+ }
const Tensor& input_tensor = ctx->input(i);
const TensorShape& input_shape = input_tensor.shape();
if (num_batch != input_shape.dim_size(0)) {
- LOG(ERROR) << "input data inconsistent batch size";
- ctx->SetStatus(tensorflow::errors::FailedPrecondition(
- "Different batch sizes between input tensors"));
- return;
+ LOG(ERROR) << "Input data has inconsistent batch size: " << num_batch
+ << " vs " << input_shape.dim_size(0);
+ return kRetry;
}
auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
switch (dtype) {
@@ -310,14 +329,10 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
break;
case nvinfer1::DataType::kHALF:
LOG(ERROR) << "FP16 inputs are not supported yet!";
- ctx->SetStatus(tensorflow::errors::InvalidArgument(
- "FP16 inputs are not supported!"));
- return;
+ return kRetry;
case nvinfer1::DataType::kINT8:
LOG(ERROR) << "INT8 inputs are not supported yet!";
- ctx->SetStatus(tensorflow::errors::InvalidArgument(
- "INT8 inputs are not supported!"));
- return;
+ return kRetry;
#if NV_TENSORRT_MAJOR > 3
case nvinfer1::DataType::kINT32:
buffers[binding_index] = (void*)(input_tensor.flat<int32>().data());
@@ -325,9 +340,7 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
#endif
default:
LOG(ERROR) << "Unknown TRT data type: " << int(dtype);
- ctx->SetStatus(tensorflow::errors::InvalidArgument(
- "Unknown output TRT data type! ", static_cast<int>(dtype)));
- return;
+ return kRetry;
}
}
@@ -344,20 +357,23 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
std::vector<int> trt_shape(dims.nbDims + 1);
trt_shape[0] = num_batch;
for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
- OP_REQUIRES_OK(
- ctx, TensorShapeUtils::MakeShape(trt_shape.data(), trt_shape.size(),
- &output_shape));
+ auto status = TensorShapeUtils::MakeShape(
+ trt_shape.data(), trt_shape.size(), &output_shape);
+ if (!status.ok()) {
+ LOG(ERROR) << "Failed to get output shape: " << status;
+ return kRetry;
+ }
} else {
- LOG(ERROR) << "output node not found, at " << output_name;
- ctx->SetStatus(tensorflow::errors::Internal("output ", output_name,
- " couldn't be found!"));
- return;
+ LOG(ERROR) << "Output node not found, at " << output_name;
+ return kRetry;
}
auto status = ctx->allocate_output(i, output_shape, &output_tensor);
if (!status.ok()) {
LOG(ERROR) << "Allocating output failed with " << status;
ctx->SetStatus(status);
- return;
+ // Do not retry since we cannot allocate the same output twice.
+ // TODO(aaroey): ideally we should retry, fix this.
+ return !kRetry;
}
auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
switch (dtype) {
@@ -366,15 +382,11 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
reinterpret_cast<void*>(output_tensor->flat<float>().data());
break;
case nvinfer1::DataType::kHALF:
- LOG(ERROR) << "half size is not supported yet!";
- ctx->SetStatus(tensorflow::errors::InvalidArgument(
- "Half outputs are not supported!"));
- return;
+ LOG(WARNING) << "half size is not supported yet!";
+ return kRetry;
case nvinfer1::DataType::kINT8:
- LOG(ERROR) << "int8 is not supported yet!";
- ctx->SetStatus(tensorflow::errors::InvalidArgument(
- "INT8 outputs are not supported!"));
- return;
+ LOG(WARNING) << "int8 is not supported yet!";
+ return kRetry;
#if NV_TENSORRT_MAJOR > 3
case nvinfer1::DataType::kINT32:
buffers[binding_index] =
@@ -382,13 +394,11 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
break;
#endif
default:
- LOG(ERROR) << "Unknown TRT data type: " << static_cast<int>(dtype);
- ctx->SetStatus(tensorflow::errors::InvalidArgument(
- "Unsupported output data type! ", static_cast<int>(dtype)));
- return;
+ LOG(WARNING) << "Unknown TRT data type: " << static_cast<int>(dtype);
+ return kRetry;
}
}
- // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
+ // Copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
const cudaStream_t* stream = CHECK_NOTNULL(
reinterpret_cast<const cudaStream_t*>(ctx->op_device_context()
->stream()
@@ -396,15 +406,14 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
->GpuStreamMemberHack()));
// TODO(jie): trt enqueue does not return error
- auto& trt_execution_context_ptr = engine_ctx_pair.second;
auto ret = trt_execution_context_ptr->enqueue(num_batch, &buffers[0], *stream,
nullptr);
if (!ret) {
- LOG(ERROR) << "Failed to enqueue batch for TRT engine: " << name();
- ctx->SetStatus(tensorflow::errors::Internal(
- "Failed to enqueue batch for TRT engine: ", name()));
+ LOG(WARNING) << "Failed to enqueue batch for TRT engine: " << name();
+ return kRetry;
}
- // sync should be done by TF.
+ // Synchronization will be done by TF.
+ return !kRetry;
}
TRTEngineOp::~TRTEngineOp() {
@@ -424,8 +433,6 @@ nvinfer1::IGpuAllocator* TRTEngineOp::GetAllocator(OpKernelContext* ctx) {
if (!alloc) {
LOG(ERROR) << "Can't find device allocator for gpu device "
<< device->name();
- ctx->SetStatus(tensorflow::errors::Internal(
- "Can't get device allocator for device ", device->name()));
return nullptr;
}
allocator_.reset(new TRTDeviceAllocator(alloc));
@@ -452,7 +459,6 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
#if NV_TENSORRT_MAJOR > 3
auto allocator = GetAllocator(ctx);
if (allocator == nullptr) {
- // GetAllocator already set the Status.
return null_pair;
}
infer->setGpuAllocator(allocator);
@@ -469,7 +475,9 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
raw_static_engine->createExecutionContext())};
// Runtime is safe to delete after engine creation
serialized_segment_.clear();
- if (max_batch_size < batch_size) return null_pair;
+ if (max_batch_size < batch_size) {
+ return null_pair;
+ }
return engine_map_.at(max_batch_size);
} // static_engine_
@@ -481,7 +489,6 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
#if NV_TENSORRT_MAJOR > 3
allocator = GetAllocator(ctx);
if (allocator == nullptr) {
- // GetAllocator already set the Status.
return null_pair;
}
#endif
@@ -505,9 +512,8 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
// retry in the future.
engine_map_[batch_size] = {nullptr, nullptr};
}
- LOG(ERROR) << "Engine creation for batch size " << batch_size
- << " failed " << status;
- ctx->SetStatus(tensorflow::errors::Internal("Engine creation failed!"));
+ LOG(WARNING) << "Engine creation for batch size " << batch_size
+ << " failed " << status;
return null_pair;
}
VLOG(1) << "Conversion is done";
@@ -519,7 +525,7 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
}
tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
- tensorflow::OpKernelContext* ctx, TRTCalibrationResource** cr) {
+ OpKernelContext* ctx, TRTCalibrationResource** cr) {
auto cres = new TRTCalibrationResource();
*cr = cres;
// Get the allocator.
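The net effect of the refactoring in this file is a uniform fallback policy: instead of setting an error status on the op context, recoverable TRT failures now return kRetry from ExecuteTrtEngine and the op re-runs the native TensorFlow segment. The control flow, sketched in Python with hypothetical helpers:

K_RETRY = True

def compute_async(ctx):
  batch = get_engine_batch(ctx)          # -1 when the engine cache is full
  if batch < 0:
    return execute_native_segment(ctx)
  engine = get_engine(batch, ctx)        # None when creation/retrieval fails
  if engine is None:
    return execute_native_segment(ctx)
  if execute_trt_engine(ctx, batch, engine) == K_RETRY:
    return execute_native_segment(ctx)   # runtime failure: fall back

The one deliberately non-retryable case is a failed output allocation, since the same output cannot be allocated twice (see the TODO in the diff above).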
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 9265250605..59b744e6d3 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -60,6 +60,12 @@ class TRTEngineOp : public AsyncOpKernel {
// Execute replaced native segment as function Op.
void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper);
+ // Execute the tensorrt engine. Returns whether we need to retry by running
+ // the native segment.
+ bool ExecuteTrtEngine(OpKernelContext* ctx, const int num_batch,
+ nvinfer1::ICudaEngine* trt_engine_ptr,
+ nvinfer1::IExecutionContext* trt_execution_context_ptr);
+
// Allocate necessary resources for calibration
Status AllocateCalibrationResources(OpKernelContext* ctx,
TRTCalibrationResource** cr);
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index cc42913eca..008fffc954 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include <queue>
#include <set>
#include <unordered_map>
#include <vector>
@@ -32,6 +33,7 @@ namespace tensorflow {
namespace tensorrt {
namespace segment {
using ::tensorflow::strings::StrAppend;
+
// A simple graph representation to mirror tensorflow::Graph. This structure
// helps saving memory since segmenter modifies the graph in place, preventing
// the need to create a copy of the graph. It is composed of edges and nodes.
@@ -215,7 +217,7 @@ namespace {
bool CheckCycles(const std::unique_ptr<SimpleGraph>& g, const SimpleNode* src,
const std::vector<SimpleNode*>& start) {
- // copied from TF ReverseDFS.
+ // Copied from TF ReverseDFS, which only works for tensorflow::Graph.
struct Work {
SimpleNode* node;
bool leave; // Are we entering or leaving n?
@@ -269,6 +271,24 @@ bool CanContractEdge(const SimpleEdge* edge,
// 1. Get all nodes incoming to 'dst', excluding 'src'
// 2. Reverse DFS from those nodes
// 3. If reverse DFS reaches 'src' then we have a cycle
+ //
+ // TODO(aaroey): there are several problems with the current approach:
+ // 1. src->dst->src, this is not detected but it should be;
+ // 2. src->dst->...(any node sequence that doesn't contain src)...->dst, this
+ // is detected but it should not be.
+ //
+ // Note that it's fine for dst to connect back to src indirectly (i.e.
+ // through a path of length > 1 that consists of intermediate nodes other
+ // than src). While loops are one example.
+ //
+ // The goal is to make sure that the trt subgraph:
+ // 1. has no loops (i.e. is a DAG), and
+ // 2. if there is a path in the subgraph from X to Y (X and Y are both nodes
+ // in the subgraph), then all paths from X to Y are in the subgraph.
+ //
+ // To achieve this goal, the correct way seems to be:
+ // 1. remove any direct edge from src->dst;
+ // 2. detect if src can reach dst, if so they cannot be merged.
std::vector<SimpleNode*> dfs_start_nodes;
for (SimpleNode* node : dst->in_nodes()) {
if (node != src) {
@@ -276,8 +296,8 @@ bool CanContractEdge(const SimpleEdge* edge,
}
}
- bool is_cycle = CheckCycles(graph, src, dfs_start_nodes);
- return !is_cycle;
+ const bool has_cycle = CheckCycles(graph, src, dfs_start_nodes);
+ return !has_cycle;
}
} // namespace
@@ -342,22 +362,20 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
}
tensorflow::Status SegmentGraph(
- const tensorflow::GraphDef& gdef,
- const std::function<bool(const tensorflow::Node*)>& candidate_fn,
- const SegmentOptions& options, SegmentNodesVector* segments) {
- // Create a Graph representation of the GraphDef.
- tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
- gdef.library());
- tensorflow::Graph graph(flib);
- TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
- tensorflow::GraphConstructorOptions(), gdef, &graph));
- return SegmentGraph(&graph, candidate_fn, options, segments);
-}
-
-tensorflow::Status SegmentGraph(
- tensorflow::Graph* tf_graph,
+ const tensorflow::Graph* tf_graph,
const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+ const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn,
+ const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn,
const SegmentOptions& options, SegmentNodesVector* segments) {
+ // Steps:
+ // 1. run the segmentation algorithm to find all the segments, which uses
+ // candidate_fn to determine the candidate segment nodes;
+ // 2. for each segment, remove the nodes that are inputs/outputs of the
+ // segment but are not eligible, using input/output_candidate_fn to
+ // determine eligibility;
+ // 3. convert the segments into the expected return format and return.
+
+ // --------------------------------- Step 1 ---------------------------------
auto graph = std::unique_ptr<SimpleGraph>(new SimpleGraph(tf_graph));
// Use a union-find to collect the nodes that belong to the same
// segment. A node value of nullptr indicates that the node is not a candidate
@@ -372,14 +390,19 @@ tensorflow::Status SegmentGraph(
node_segments.emplace_back(node);
}
- // The segmentation algorithm below visits nodes in reverse
- // topological order and attempts to merge nodes along output
- // edges. That means that subgraphs grow from the output-side of the
- // network towards the inputs. In general this is not guaranteed to
- // produce a globally optimal segmentation. In the future if we have
- // a measure of how beneficial it is to include a given node in a
- // TRT subgraph then we can revisit this algorithm to take advantage
- // of that information.
+ // The segmentation algorithm below visits nodes in reverse topological order
+ // and attempts to merge nodes along output edges. That means that subgraphs
+ // grow from the output-side of the network towards the inputs.
+ //
+ // In general this is not guaranteed to produce a globally optimal
+ // segmentation. For example, consider a graph with nodes {A, B, C, D} and
+ // edges {A->B, A->C, B->D, C->D}, where A, B, D are TRT-compatible but C is
+ // not. In theory we can contract either {A, B} or {B, D} but not both; here
+ // the algorithm always chooses to contract {B, D}.
+ //
+ // In the future if we have a measure of how beneficial it is to include a
+ // given node in a TRT subgraph then we can revisit this algorithm to take
+ // advantage of that information.
std::vector<tensorflow::Node*> tforder;
tensorflow::GetPostOrder(*tf_graph, &tforder);
// use postorder implementation from tensorflow and construct mirror in
@@ -392,13 +415,11 @@ tensorflow::Status SegmentGraph(
for (const SimpleNode* node : order) {
// All output nodes of 'node' have been visited...
VLOG(2) << "Trying node " << node->name() << " id=" << node->id();
-
// 'node' must be a TRT candidate...
if (node_segments[node->id()].Value() == nullptr) {
VLOG(2) << "... not a TRT candidate";
continue;
}
-
// Contract output edges to combine 'node' with output
// nodes. Iterate since combining two nodes may unblock other
// combining.
@@ -416,7 +437,6 @@ tensorflow::Status SegmentGraph(
VLOG(2) << "... ... not a TRT candidate";
continue;
}
-
if (CanContractEdge(out_edge, graph)) {
VLOG(2) << "... ... can contract";
contract_edges.insert(out_edge);
@@ -424,11 +444,9 @@ tensorflow::Status SegmentGraph(
VLOG(2) << "... ... cannot contract, would form cycle";
}
}
-
if (contract_edges.empty()) {
break;
}
-
// Contract edges and collect the adjacent nodes into the same
// segment/subgraph.
while (!contract_edges.empty()) {
@@ -457,11 +475,22 @@ tensorflow::Status SegmentGraph(
// Collect the segments/subgraphs. Each subgraph is represented by a
// set of the names of the nodes in that subgraph.
- std::unordered_map<string, std::set<string>> sg_map;
+
+ // A map from the segment identifier (currently the name of the root node of
+ // the segment tree) to the segment nodes set.
+ std::unordered_map<string, std::set<const tensorflow::Node*>> sg_map;
+
+ // A map from the segment identifier (currently the name of the root node of
+ // the segment tree) to the device names that the nodes in the segment are
+ // assigned to.
+ //
+ // TODO(aaroey): nodes assigned to different devices should not be merged,
+ // fix this.
std::unordered_map<string, std::set<string>> device_maps;
+
for (auto& u : node_segments) {
if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) {
- sg_map[u.ParentValue()->name()].insert(u.Value()->name());
+ sg_map[u.ParentValue()->name()].insert(u.Value()->tf_node());
auto tf_node = u.Value()->tf_node();
// has_assigned_device_name() is expected to return true
// when called from optimization pass. However, since graph
@@ -482,25 +511,104 @@ tensorflow::Status SegmentGraph(
}
}
+ // --------------------------------- Step 2 ---------------------------------
+ // Remove ineligible input/output nodes.
+ for (auto& itr : sg_map) {
+ std::set<const tensorflow::Node*>& segment_nodes = itr.second;
+ VLOG(1) << "Segment original size: " << segment_nodes.size();
+ while (true) {
+ std::deque<const tensorflow::Node*> in_nodes_que, out_nodes_que;
+ // Find an input node that is not eligible and add it to the queue.
+ // Nodes that have no incoming edges should not be treated as "input",
+ // as there are really no inputs to them. Similarly for output nodes.
+ for (auto node : segment_nodes) {
+ bool added = false;
+ for (const tensorflow::Edge* edge : node->in_edges()) {
+ if (!edge->IsControlEdge() && !edge->src()->IsSource() &&
+ !segment_nodes.count(edge->src())) { // 'node' is an input node.
+ if (!input_candidate_fn(edge)) {
+ in_nodes_que.push_back(node);
+ added = true;
+ break;
+ }
+ }
+ }
+ if (added) continue; // Only adding the node once to either queue.
+ for (const tensorflow::Edge* edge : node->out_edges()) {
+ if (!edge->dst()->IsSink() && !edge->IsControlEdge() &&
+ !segment_nodes.count(edge->dst())) { // 'node' is an output node.
+ if (!output_candidate_fn(edge)) {
+ out_nodes_que.push_back(node);
+ break;
+ }
+ }
+ }
+ }
+ if (in_nodes_que.empty() && out_nodes_que.empty()) {
+ // No more ineligible input/output nodes.
+ break;
+ }
+ // Now for each ineligible node, remove all of its inputs or outputs from
+ // the subgraph.
+ //
+ // It can be proven that, if the original subgraph:
+ // 1. is a DAG, and
+ // 2. all paths between two nodes in the subgraph are all inside the
+ // subgraph
+ // then after doing this operation the resulting subgraph will keep the
+ // same properties 1 and 2.
+ //
+ // For simplicity we use a heuristic: for input nodes remove all of their
+ // inputs, and for output nodes remove all of their outputs. In this way,
+ // the number of removed nodes should be minimal in common cases.
+ auto remove_nodes = [&segment_nodes](
+ bool is_input_nodes,
+ std::deque<const tensorflow::Node*>* que) {
+ // Run a BFS on the queue to find all the input/output nodes.
+ std::set<const tensorflow::Node*> visited;
+ while (!que->empty()) {
+ auto node = que->front();
+ que->pop_front();
+ if (!visited.insert(node).second) continue;
+ segment_nodes.erase(node);
+ for (auto in :
+ is_input_nodes ? node->in_nodes() : node->out_nodes()) {
+ if (segment_nodes.count(in)) {
+ que->push_back(in);
+ VLOG(2) << "Need to remove node " << in->name()
+ << " because one of its "
+ << (is_input_nodes ? "output" : "input")
+ << " nodes in the graph was removed: " << node->name();
+ }
+ }
+ }
+ };
+ remove_nodes(true, &in_nodes_que);
+ remove_nodes(false, &out_nodes_que);
+ }
+ VLOG(1) << "Segment new size: " << segment_nodes.size();
+ }
+
+ // --------------------------------- Step 3 ---------------------------------
// Convert the segments into the expected return format
for (const auto& itr : sg_map) {
- const auto& segment_node_names = itr.second;
+ const std::set<const tensorflow::Node*>& segment_nodes = itr.second;
if (VLOG_IS_ON(1)) {
string s;
- for (const auto& name : segment_node_names) {
- s += " " + name;
- }
- VLOG(1) << "Segment " << segments->size() << ":" << s;
+ for (auto node : segment_nodes) s += " " + node->name();
+ VLOG(1) << "Segment " << segments->size() << ": " << s;
}
// Don't use small segments.
- if (static_cast<int>(segment_node_names.size()) <
- options.minimum_segment_size) {
+ if (static_cast<int>(segment_nodes.size()) < options.minimum_segment_size) {
VLOG(1) << "Segment " << segments->size() << " has only "
- << segment_node_names.size() << " nodes, dropping";
+ << segment_nodes.size() << " nodes, dropping";
continue;
}
+
// TODO(sami): Make segmenter placement aware once trtscopes are in place
+ std::set<string> segment_node_names;
+ for (auto node : itr.second) segment_node_names.insert(node->name());
const auto& dev_itr = device_maps.find(itr.first);
if (dev_itr == device_maps.end() || dev_itr->second.empty()) {
VLOG(1) << "No device assigned to segment " << segments->size();
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index 81b4bfe49f..8c44eb782a 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -42,22 +42,6 @@ struct SegmentOptions {
// Get the subgraphs of a graph that can be handled by TensorRT.
//
-// @param gdef The GraphDef describing the network
-// @param candidate_fn A function that returns true for a NodeDef if
-// that node can be handled by TensorRT.
-// @param segments Returns the TensorRT segments/subgraphs. Each entry
-// in the vector describes a subgraph by giving a set of the names of
-// all the NodeDefs in that subgraph.
-// @return the status.
-//
-// TODO(aaroey): remove this method.
-tensorflow::Status SegmentGraph(
- const tensorflow::GraphDef& gdef,
- const std::function<bool(const tensorflow::Node*)>& candidate_fn,
- const SegmentOptions& options, SegmentNodesVector* segments);
-
-// Get the subgraphs of a graph that can be handled by TensorRT.
-//
// @param graph tensorflow::Graph of the network
// @param candidate_fn A function that returns true for a Node* if
// that node can be handled by TensorRT.
@@ -66,8 +50,10 @@ tensorflow::Status SegmentGraph(
// all the NodeDefs in that subgraph.
// @return the status.
tensorflow::Status SegmentGraph(
- tensorflow::Graph* tf_graph,
+ const tensorflow::Graph* tf_graph,
const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+ const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn,
+ const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn,
const SegmentOptions& options, SegmentNodesVector* segments);
} // namespace segment
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index f5b2d258d7..432e7b1c04 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -14,350 +14,245 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/contrib/tensorrt/segment/segment.h"
-#include "tensorflow/c/c_api.h"
-#include "tensorflow/core/framework/graph.pb.h"
+
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/graph/testlib.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
namespace tensorflow {
namespace tensorrt {
namespace segment {
namespace test {
+namespace ops = ::tensorflow::ops;
class SegmentTest : public ::testing::Test {
- public:
- bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def);
-
- TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name);
- TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
- TF_Status* s, const char* name);
-
- std::function<bool(const tensorflow::Node*)> MakeCandidateFn(
- const std::set<string>& node_names);
-
protected:
- void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name,
- TF_Operation** op);
- void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
- TF_Status* s, const char* name, TF_Operation** op, bool check);
-
- SegmentOptions default_options_;
-};
-
-bool SegmentTest::GetGraphDef(TF_Graph* graph,
- tensorflow::GraphDef* graph_def) {
- TF_Status* s = TF_NewStatus();
- TF_Buffer* buffer = TF_NewBuffer();
- TF_GraphToGraphDef(graph, buffer, s);
- bool ret = TF_GetCode(s) == TF_OK;
- EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- if (ret) ret = graph_def->ParseFromArray(buffer->data, buffer->length);
- TF_DeleteBuffer(buffer);
- TF_DeleteStatus(s);
- return ret;
-}
+ std::function<bool(const tensorflow::Node*)> MakeCandidateFn(
+ const std::set<string>& node_names) {
+ return [node_names](const tensorflow::Node* node) -> bool {
+ return node_names.find(node->name()) != node_names.end();
+ };
+ }
-std::function<bool(const tensorflow::Node*)> SegmentTest::MakeCandidateFn(
- const std::set<string>& node_names) {
- return [node_names](const tensorflow::Node* node) -> bool {
- return node_names.find(node->name()) != node_names.end();
- };
-}
+ std::function<bool(const tensorflow::Edge*)> MakeInputEdgeCandidateFn(
+ const std::set<string>& node_names) {
+ return [node_names](const tensorflow::Edge* in_edge) -> bool {
+ return node_names.find(in_edge->dst()->name()) != node_names.end();
+ };
+ }
-void SegmentTest::PlaceholderHelper(TF_Graph* graph, TF_Status* s,
- const char* name, TF_Operation** op) {
- TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
- TF_SetAttrType(desc, "dtype", TF_INT32);
- *op = TF_FinishOperation(desc, s);
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- ASSERT_NE(*op, nullptr);
-}
+ std::function<bool(const tensorflow::Edge*)> MakeOutputEdgeCandidateFn(
+ const std::set<string>& node_names) {
+ return [node_names](const tensorflow::Edge* out_edge) -> bool {
+ return node_names.find(out_edge->src()->name()) != node_names.end();
+ };
+ }
-TF_Operation* SegmentTest::Placeholder(TF_Graph* graph, TF_Status* s,
- const char* name) {
- TF_Operation* op;
- PlaceholderHelper(graph, s, name, &op);
- return op;
-}
+ void RunTest(const tensorflow::Graph* graph,
+ const std::set<string>& candidates,
+ const std::set<string>& input_candidates,
+ const std::set<string>& output_candidates,
+ const std::vector<std::set<string>>& expected_segments) {
+ SegmentNodesVector segments;
+ TF_EXPECT_OK(SegmentGraph(graph, MakeCandidateFn(candidates),
+ MakeInputEdgeCandidateFn(input_candidates),
+ MakeOutputEdgeCandidateFn(output_candidates),
+ default_options_, &segments));
+ ValidateSegment(segments, expected_segments);
+ }
-void SegmentTest::AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
- TF_Status* s, const char* name, TF_Operation** op,
- bool check) {
- TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
- TF_Output add_inputs[2] = {{l, 0}, {r, 0}};
- TF_AddInputList(desc, add_inputs, 2);
- *op = TF_FinishOperation(desc, s);
- if (check) {
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- ASSERT_NE(*op, nullptr);
+ void ValidateSegment(const SegmentNodesVector& segments,
+ const std::vector<std::set<string>>& expected_segments) {
+ EXPECT_EQ(expected_segments.size(), segments.size());
+ for (int i = 0; i < segments.size(); ++i) {
+ const auto& segment_node_names = segments[i].first;
+ const auto& expected = expected_segments[i];
+ for (const auto& name : expected) {
+ EXPECT_TRUE(segment_node_names.count(name))
+ << "Segment " << i << " is missing expected node: " << name;
+ }
+ if (segment_node_names.size() == expected.size()) continue;
+ for (const auto& name : segment_node_names) {
+ EXPECT_TRUE(expected.count(name))
+ << "Unexpected node found in segment " << i << ": " << name;
+ }
+ }
}
-}
-TF_Operation* SegmentTest::Add(TF_Operation* l, TF_Operation* r,
- TF_Graph* graph, TF_Status* s,
- const char* name) {
- TF_Operation* op;
- AddHelper(l, r, graph, s, name, &op, true);
- return op;
+ SegmentOptions default_options_;
+};
+
+std::set<string> operator-(const std::set<string>& lhs, const string& rhs) {
+ std::set<string> result = lhs;
+ CHECK(result.erase(rhs));
+ return result;
}
TEST_F(SegmentTest, Empty) {
- TF_Graph* graph = TF_NewGraph();
-
- GraphDef graph_def;
- ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
- SegmentNodesVector segments;
- ASSERT_EQ(
- SegmentGraph(graph_def, MakeCandidateFn({}), default_options_, &segments),
- tensorflow::Status::OK());
-
+ Scope s = Scope::NewRootScope();
+ tensorflow::Graph g(OpRegistry::Global());
+ TF_EXPECT_OK(s.ToGraph(&g));
// Expect no segments/subgraphs.
- EXPECT_TRUE(segments.empty());
- TF_DeleteGraph(graph);
+ RunTest(&g, {}, {}, {}, {});
}
TEST_F(SegmentTest, Simple) {
- TF_Status* s = TF_NewStatus();
- TF_Graph* graph = TF_NewGraph();
-
// feed
- // // ||
+ // // \\
// add0 add1
- // | | /
+ // | \ /
// | add2
- // | / ||
+ // | / \\
// add3 add4
- // | /
+ // \ /
// <sink>
- //
- TF_Operation* feed = Placeholder(graph, s, "feed");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
-
- TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
- TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
-
- GraphDef graph_def;
- ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
- SegmentNodesVector segments;
- ASSERT_EQ(
- SegmentGraph(graph_def,
- MakeCandidateFn({"add0", "add1", "add2", "add3", "add4"}),
- default_options_, &segments),
- tensorflow::Status::OK());
-
- // Expect all Add operations to be collapsed into a single segment
- ASSERT_EQ(segments.size(), 1);
- std::vector<string> expected{"add0", "add1", "add2", "add3", "add4"};
- for (const auto& ex : expected) {
- EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
- << "Missing expected node " << ex;
- }
- TF_DeleteGraph(graph);
- TF_DeleteStatus(s);
+ Scope s = Scope::NewRootScope();
+ auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT);
+ auto add0 = ops::Add(s.WithOpName("add0"), feed, feed);
+ auto add1 = ops::Add(s.WithOpName("add1"), feed, feed);
+ auto add2 = ops::Add(s.WithOpName("add2"), add0, add1);
+ auto add3 = ops::Add(s.WithOpName("add3"), add0, add2);
+ auto add4 = ops::Add(s.WithOpName("add4"), add2, add2);
+ tensorflow::Graph g(OpRegistry::Global());
+ TF_EXPECT_OK(s.ToGraph(&g));
+
+ // All Add operations are candidates, and we expect all of them to be
+ // collapsed into a single segment
+ const std::set<string> all_adds = {"add0", "add1", "add2", "add3", "add4"};
+ RunTest(&g, all_adds, all_adds, all_adds, {all_adds});
+
+ // Make add1 not a candidate, and we expect all other Add operations to be
+ // collapsed into a single segment
+ auto without_add1 = all_adds - "add1";
+ RunTest(&g, without_add1, without_add1, without_add1, {without_add1});
+
+ // Make add1 not a candidate and add2 not an input candidate, and we expect
+ // add0 and add2 to be removed from the segment.
+ auto without_add2 = all_adds - "add2";
+ RunTest(&g, without_add1, without_add2, without_add1, {{"add3", "add4"}});
+
+ // Making add2 not an input candidate by itself won't affect anything.
+ RunTest(&g, all_adds, without_add2, all_adds, {all_adds});
+
+ // Making add1 not an input candidate excludes it from the segment.
+ RunTest(&g, all_adds, without_add1, all_adds, {without_add1});
+
+ // Making add3 not an output candidate doesn't affect anything, since its
+ // output is the sink.
+ auto without_add3 = all_adds - "add3";
+ RunTest(&g, all_adds, all_adds, without_add3, {all_adds});
}
TEST_F(SegmentTest, AvoidCycle) {
- TF_Status* s = TF_NewStatus();
- TF_Graph* graph = TF_NewGraph();
-
- // add2 is not a TRT candidate so add0/add3 cannot be formed as a
- // subgraph
- //
// feed
- // // ||
+ // // \\
// add0 add1
- // | | /
+ // | \ /
// | add2
- // | / ||
+ // | / \\
// add3 add4
- // | /
+ // \ /
// <sink>
- //
- TF_Operation* feed = Placeholder(graph, s, "feed");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
-
- TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
- TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
-
- GraphDef graph_def;
- ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
- SegmentNodesVector segments;
- ASSERT_EQ(
- SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add3", "add4"}),
- default_options_, &segments),
- tensorflow::Status::OK());
-
- // Expect no subgraphs
- EXPECT_EQ(segments.size(), 0);
- TF_DeleteGraph(graph);
- TF_DeleteStatus(s);
+ Scope s = Scope::NewRootScope();
+ auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT);
+ auto add0 = ops::Add(s.WithOpName("add0"), feed, feed);
+ auto add1 = ops::Add(s.WithOpName("add1"), feed, feed);
+ auto add2 = ops::Add(s.WithOpName("add2"), add0, add1);
+ auto add3 = ops::Add(s.WithOpName("add3"), add0, add2);
+ auto add4 = ops::Add(s.WithOpName("add4"), add2, add2);
+ tensorflow::Graph g(OpRegistry::Global());
+ TF_EXPECT_OK(s.ToGraph(&g));
+
+ // add2 is not a TRT candidate so there should be no segments generated.
+ const std::set<string> without_add2 = {"add0", "add1", "add3", "add4"};
+ RunTest(&g, without_add2, without_add2, without_add2, {});
}
TEST_F(SegmentTest, Multiple) {
- TF_Status* s = TF_NewStatus();
- TF_Graph* graph = TF_NewGraph();
-
- // add5 is not a TRT candidate so two subgraphs should be formed
- //
- // feed
- // // || ||
- // add0 add1 add7
- // | | / / ||
- // | add2-----add5 add8
- // | / | | | |
- // add3 add4 add6
- // | | /
- // <sink>
- //
- TF_Operation* feed = Placeholder(graph, s, "feed");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
-
- TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add7 = Add(feed, feed, graph, s, "add7");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add5 = Add(add2, add7, graph, s, "add5");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add8 = Add(add7, add7, graph, s, "add8");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
- TF_Operation* add4 = Add(add2, add5, graph, s, "add4");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
- TF_Operation* add6 = Add(add5, add8, graph, s, "add6");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("add6"), string(TF_OperationName(add6)));
-
- GraphDef graph_def;
- ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
- SegmentNodesVector segments;
- ASSERT_EQ(SegmentGraph(graph_def,
- MakeCandidateFn({"add0", "add1", "add2", "add3",
- "add4", "add6", "add7", "add8"}),
- default_options_, &segments),
- tensorflow::Status::OK());
-
- // Expect two subgraphs
- EXPECT_EQ(segments.size(), 2);
-
- std::vector<string> expected0{"add6", "add8"};
- for (const auto& ex : expected0) {
- EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
- << "Missing expected node " << ex;
- }
-
- std::vector<string> expected1{"add0", "add1", "add2", "add3"};
- for (const auto& ex : expected1) {
- EXPECT_TRUE(segments[1].first.find(ex) != segments[1].first.end())
- << "Missing expected node " << ex;
- }
- TF_DeleteGraph(graph);
- TF_DeleteStatus(s);
+ // feed
+ // // || \\
+ // add0 add1 add7
+ // | \ / / \\
+ // | add2 / \\
+ // | || \ | ||
+ // | || add5 add8
+ // | / \ / \ /
+ // add3 add4 add6
+ // \ | /
+ // <sink>
+ Scope s = Scope::NewRootScope();
+ auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT);
+ auto add0 = ops::Add(s.WithOpName("add0"), feed, feed);
+ auto add1 = ops::Add(s.WithOpName("add1"), feed, feed);
+ auto add7 = ops::Add(s.WithOpName("add7"), feed, feed);
+ auto add2 = ops::Add(s.WithOpName("add2"), add0, add1);
+ auto add5 = ops::Add(s.WithOpName("add5"), add2, add7);
+ auto add8 = ops::Add(s.WithOpName("add8"), add7, add7);
+ auto add3 = ops::Add(s.WithOpName("add3"), add0, add2);
+ auto add4 = ops::Add(s.WithOpName("add4"), add2, add5);
+ auto add6 = ops::Add(s.WithOpName("add6"), add5, add8);
+ tensorflow::Graph g(OpRegistry::Global());
+ TF_EXPECT_OK(s.ToGraph(&g));
+
+ const std::set<string> all_adds = {"add0", "add1", "add2", "add3", "add4",
+ "add5", "add6", "add7", "add8"};
+ // Make add5 not a TRT candidate, and we expect two segments.
+ auto without_add5 = all_adds - "add5";
+ RunTest(&g, without_add5, without_add5, without_add5,
+ {{"add6", "add8"}, {"add0", "add1", "add2", "add3"}});
+
+ // Make add8 not a candidate and add6 not an input candidate, then all direct
+ // and indirect inputs of add6 will be removed from the segment.
+ auto without_add8 = all_adds - "add8";
+ auto without_add6 = all_adds - "add6";
+ RunTest(&g, without_add8, without_add6, all_adds, {{"add3", "add4"}});
+
+ // Make add3 not a candidate and add0 not an output candidate, then all
+ // direct and indirect outputs of add0 will be removed from the segment.
+ auto without_add3 = all_adds - "add3";
+ auto without_add0 = all_adds - "add0";
+ RunTest(&g, without_add3, all_adds, without_add0, {{"add1", "add7", "add8"}});
}
TEST_F(SegmentTest, BigIfElse) {
- TF_Status* s = TF_NewStatus();
- TF_Graph* graph = TF_NewGraph();
-
- // add2 is not a TRT candidate
- //
// feed
// ||
// add0
- // // ||
+ // // \\
// add1 add4
// || ||
// add2 add5
// || ||
// add3 add6
- // || //
+ // \\ //
// add7
// ||
// <sink>
- //
- TF_Operation* feed = Placeholder(graph, s, "feed");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
-
- TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add1 = Add(add0, add0, graph, s, "add1");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add2 = Add(add1, add1, graph, s, "add2");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add3 = Add(add2, add2, graph, s, "add3");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add4 = Add(add0, add0, graph, s, "add4");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add5 = Add(add4, add4, graph, s, "add5");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add6 = Add(add5, add5, graph, s, "add6");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- TF_Operation* add7 = Add(add3, add6, graph, s, "add7");
- ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
- EXPECT_EQ(string("add7"), string(TF_OperationName(add7)));
-
- GraphDef graph_def;
- ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
- SegmentNodesVector segments;
- ASSERT_EQ(SegmentGraph(graph_def,
- MakeCandidateFn({"add0", "add1", "add3", "add4",
- "add5", "add6", "add7"}),
- default_options_, &segments),
- tensorflow::Status::OK());
-
- // Expect 2 subgraphs
- EXPECT_EQ(segments.size(), 2);
-
- std::vector<string> expected0{"add3", "add4", "add5", "add6", "add7"};
- for (const auto& ex : expected0) {
- EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
- << "Missing expected node " << ex;
- }
-
- std::vector<string> expected1{"add0", "add1"};
- for (const auto& ex : expected1) {
- EXPECT_TRUE(segments[1].first.find(ex) != segments[1].first.end())
- << "Missing expected node " << ex;
- }
- TF_DeleteGraph(graph);
- TF_DeleteStatus(s);
+ Scope s = Scope::NewRootScope();
+ auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT);
+ auto add0 = ops::Add(s.WithOpName("add0"), feed, feed);
+ auto add1 = ops::Add(s.WithOpName("add1"), add0, add0);
+ auto add2 = ops::Add(s.WithOpName("add2"), add1, add1);
+ auto add3 = ops::Add(s.WithOpName("add3"), add2, add2);
+ auto add4 = ops::Add(s.WithOpName("add4"), add0, add0);
+ auto add5 = ops::Add(s.WithOpName("add5"), add4, add4);
+ auto add6 = ops::Add(s.WithOpName("add6"), add5, add5);
+ auto add7 = ops::Add(s.WithOpName("add7"), add3, add6);
+ tensorflow::Graph g(OpRegistry::Global());
+ TF_EXPECT_OK(s.ToGraph(&g));
+
+ // Make add2 not a TRT candidate, and we expect 2 segments.
+ const std::set<string> all_adds = {"add0", "add1", "add2", "add3",
+ "add4", "add5", "add6", "add7"};
+ RunTest(&g, all_adds - "add2", all_adds, all_adds,
+ {{"add3", "add4", "add5", "add6", "add7"}, {"add0", "add1"}});
}
} // namespace test
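The candidate functions in the rewritten test above are just set-membership predicates over node and edge-endpoint names, narrowed between runs by the operator- helper. A minimal Python sketch of the same pattern, purely illustrative (node and edge are assumed to expose name, dst, and src attributes analogous to the C++ Node/Edge):

def make_candidate_fn(node_names):
  # Mirrors SegmentTest::MakeCandidateFn: a closure capturing the allowed
  # set and testing each node's name for membership.
  return lambda node: node.name in node_names

def make_input_edge_candidate_fn(node_names):
  # Mirrors MakeInputEdgeCandidateFn: accept an edge iff its destination
  # node is in the allowed set.
  return lambda edge: edge.dst.name in node_names

In Python the per-run narrowing is ordinary set difference, e.g. all_adds - {"add1"}, which is exactly what the C++ operator-(set, string) helper above emulates.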
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/contrib/tensorrt/tensorrt_test.cc
index 3712a9a6fe..769982c645 100644
--- a/tensorflow/contrib/tensorrt/tensorrt_test.cc
+++ b/tensorflow/contrib/tensorrt/tensorrt_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/test.h"
#if GOOGLE_CUDA
@@ -130,6 +132,13 @@ void Execute(nvinfer1::IExecutionContext* context, const float* input,
}
TEST(TensorrtTest, BasicFunctions) {
+ // Handle the case where the test is run on a machine with no GPU available.
+ if (CHECK_NOTNULL(GPUMachineManager())->VisibleDeviceCount() <= 0) {
+ LOG(WARNING) << "No gpu device available, probably not being run on a gpu "
+ "machine. Skipping...";
+ return;
+ }
+
// Create the network model.
nvinfer1::IHostMemory* model = CreateNetwork();
// Use the model to create an engine and then an execution context.
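The new Python tests later in this patch run under the same GPU-or-nothing constraint. A hedged sketch of the equivalent guard on the Python side, using tf.test's is_gpu_available (the class and test names here are hypothetical):

from tensorflow.python.platform import test

class GpuOnlyTest(test.TestCase):

  def testRequiresGpu(self):
    # Skip gracefully on CPU-only machines, mirroring the C++ guard above.
    if not test.is_gpu_available():
      self.skipTest("No GPU device available; skipping.")
    with self.test_session(use_gpu=True):
      pass  # GPU-dependent assertions would go here.

if __name__ == "__main__":
  test.main()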
diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py
new file mode 100644
index 0000000000..edd30ad7a9
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/base_test.py
@@ -0,0 +1,126 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Basic tests for TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class SimpleSingleEngineGraphDefTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Create a graph containing single segment."""
+ # TODO(aaroey): test graph with different dtypes.
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [100, 24, 24, 2]
+ g = ops.Graph()
+ with g.as_default():
+ inp = array_ops.placeholder(
+ dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+ with g.device("/GPU:0"):
+ conv_filter = constant_op.constant(
+ [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+ name="weights",
+ dtype=dtype)
+ conv = nn.conv2d(
+ input=inp,
+ filter=conv_filter,
+ strides=[1, 2, 2, 1],
+ padding="SAME",
+ name="conv")
+ bias = constant_op.constant(
+ [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtype)
+ added = nn.bias_add(conv, bias, name="bias_add")
+ relu = nn.relu(added, "relu")
+ identity = array_ops.identity(relu, "identity")
+ pool = nn_ops.max_pool(
+ identity, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+ array_ops.squeeze(pool, name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=1,
+ expected_output_dims=(100, 6, 6, 6),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+class SimpleMultiEngineGraphDefTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Create a graph containing multiple segment."""
+ # TODO(aaroey): test graph with different dtypes.
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [100, 24, 24, 2]
+ g = ops.Graph()
+ with g.as_default():
+ inp = array_ops.placeholder(
+ dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+ with g.device("/GPU:0"):
+ conv_filter = constant_op.constant(
+ [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+ name="weights",
+ dtype=dtype)
+ conv = nn.conv2d(
+ input=inp,
+ filter=conv_filter,
+ strides=[1, 2, 2, 1],
+ padding="SAME",
+ name="conv")
+ c1 = constant_op.constant(
+ np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype)
+ p = conv * c1
+ c2 = constant_op.constant(
+ np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype)
+ q = conv / c2
+
+ edge = self.trt_incompatible_op(q)
+ edge /= edge
+ r = edge + edge
+
+ p -= edge
+ q *= edge
+ s = p + q
+ s -= r
+ array_ops.squeeze(s, name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=2,
+ expected_output_dims=(100, 12, 12, 6),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+# TODO(aaroey): add a large complex graph to test.
+
+if __name__ == "__main__":
+ test.main()
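All of these new tests hand their graph and tolerances to the base class through TfTrtIntegrationTestParams, whose definition lives in the new tf_trt_integration_test_base.py at the end of this patch. A sketch of its assumed shape, with field names inferred from the call sites above:

import collections

# Field names inferred from the constructor calls in these tests; the
# authoritative definition is in tf_trt_integration_test_base.py.
TfTrtIntegrationTestParams = collections.namedtuple(
    "TfTrtIntegrationTestParams",
    ["gdef", "input_names", "input_dims", "num_expected_engines",
     "expected_output_dims", "allclose_atol", "allclose_rtol"])

This replaces the narrower TestGraph namedtuple that the deleted monolithic test below carried, which held only gdef, num_expected_engines, and expected_output_dims.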
diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
new file mode 100644
index 0000000000..730b6843fb
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
@@ -0,0 +1,76 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Testing conversion of BatchMatMul in TF-TRT conversion."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [12, 5, 8, 12]
+ w1_name = "matmul_w1"
+ w1_dims = [12, 5, 12, 7]
+ w2_name = "matmul_w2"
+ w2_dims = [12, 12, 7]
+ g = ops.Graph()
+ with g.as_default():
+ inp = array_ops.placeholder(
+ dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+ w1 = array_ops.placeholder(dtype=dtype, shape=w1_dims, name=w1_name)
+ w2 = array_ops.placeholder(dtype=dtype, shape=w2_dims, name=w2_name)
+ with g.device("/GPU:0"):
+ b = constant_op.constant(np.random.randn(12, 5, 12, 7), dtype=dtype)
+ c = constant_op.constant(np.random.randn(5, 1, 1), dtype=dtype)
+ d = constant_op.constant(np.random.randn(5, 1, 1), dtype=dtype)
+ x1 = math_ops.matmul(inp, b)
+ x1 = x1 + c
+ x2 = math_ops.matmul(inp, w1)
+ x2 = x2 * d
+ e = gen_array_ops.reshape(inp, [12, 40, 12])
+ x3 = math_ops.matmul(e, w2)
+ f = constant_op.constant(np.random.randn(40, 1), dtype=dtype)
+ x3 = x3 + f
+ x3 = gen_array_ops.reshape(x3, [12, 5, 8, 7])
+ out = x1 + x2 + x3
+ array_ops.squeeze(out, name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name, w1_name, w2_name],
+ input_dims=[input_dims, w1_dims, w2_dims],
+ num_expected_engines=1,
+ expected_output_dims=(12, 5, 8, 7),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
new file mode 100644
index 0000000000..0c03a10b64
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
@@ -0,0 +1,112 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Testing conversion of BiasAdd MatMul in TF-TRT conversion."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [48, 12]
+ g = ops.Graph()
+ with g.as_default():
+ x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+
+ b = constant_op.constant(np.random.randn(12, 4), dtype=dtype)
+ x1 = math_ops.matmul(x, b)
+ b = constant_op.constant(np.random.randn(1, 4), dtype=dtype)
+ x1 = x1 + b
+
+ b = constant_op.constant(np.random.randn(48, 4), dtype=dtype)
+ x2 = math_ops.matmul(x, b, transpose_a=True)
+ x2 = gen_array_ops.reshape(x2, [48, 1])
+
+ b = constant_op.constant(np.random.randn(4, 12), dtype=dtype)
+ x3 = math_ops.matmul(x, b, transpose_b=True)
+
+ b = constant_op.constant(np.random.randn(16, 48), dtype=dtype)
+ x4 = math_ops.matmul(x, b, transpose_b=True, transpose_a=True)
+ x4 = gen_array_ops.reshape(x4, [48, 4])
+
+ x5 = gen_array_ops.reshape(x, [4, 144])
+ b = constant_op.constant(np.random.randn(144, 48), dtype=dtype)
+ x5 = math_ops.matmul(x5, b)
+ b = constant_op.constant(np.random.randn(48), dtype=dtype)
+ x5 = nn.bias_add(x5, b)
+ x5 = gen_array_ops.reshape(x5, [48, 4])
+
+ x6 = gen_array_ops.reshape(x, [4, 12, 12])
+ b = constant_op.constant(np.random.randn(12), dtype=dtype)
+ x6 = nn.bias_add(x6, b, data_format="NHWC")
+ x6 = gen_array_ops.reshape(x6, [48, -1])
+
+ x7 = gen_array_ops.reshape(x, [4, 12, 3, 4])
+ b = constant_op.constant(np.random.randn(4), dtype=dtype)
+ x7 = nn.bias_add(x7, b, data_format="NHWC")
+ x7 = gen_array_ops.reshape(x7, [48, -1])
+
+ x8 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2])
+ b = constant_op.constant(np.random.randn(2), dtype=dtype)
+ x8 = nn.bias_add(x8, b, data_format="NHWC")
+ x8 = gen_array_ops.reshape(x8, [48, -1])
+
+ x9 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2])
+ b = constant_op.constant(np.random.randn(3), dtype=dtype)
+ x9 = nn.bias_add(x9, b, data_format="NCHW")
+ x9 = gen_array_ops.reshape(x9, [48, -1])
+
+ x10 = gen_array_ops.reshape(x, [4, 12, 3, 4])
+ b = constant_op.constant(np.random.randn(12), dtype=dtype)
+ x10 = nn.bias_add(x10, b, data_format="NCHW")
+ x10 = gen_array_ops.reshape(x10, [48, -1])
+
+ x11 = gen_array_ops.reshape(x, [4, 12, 12])
+ b = constant_op.constant(np.random.randn(4), dtype=dtype)
+ x11 = nn.bias_add(x11, b, data_format="NCHW")
+ x11 = gen_array_ops.reshape(x11, [48, -1])
+
+ out = array_ops.concat(
+ [x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11], axis=-1)
+ out = array_ops.squeeze(out, name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=7,
+ expected_output_dims=(48, 89),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
new file mode 100644
index 0000000000..dd673463a5
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
@@ -0,0 +1,119 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Tests for scale & elementwise layers in TF-TRT."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [10, 24, 24, 20]
+ g = ops.Graph()
+ with g.as_default():
+ x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+ # scale
+ a = constant_op.constant(np.random.randn(1), dtype=dtype)
+ f = x + a
+ x = math_ops.sigmoid(f)
+ # scale
+ a = constant_op.constant(np.random.randn(1), dtype=dtype)
+ f = a + x
+ x = math_ops.sigmoid(f)
+ # scale
+ a = constant_op.constant(np.random.randn(24, 1, 1), dtype=dtype)
+ f = x + a
+ x = math_ops.sigmoid(f)
+ # scale
+ a = constant_op.constant(np.random.randn(24, 1, 1), dtype=dtype)
+ f = a + x
+ x = math_ops.sigmoid(f)
+ # scale
+ a = constant_op.constant(np.random.randn(24, 24, 20), dtype=dtype)
+ f = a + x
+ x = math_ops.sigmoid(f)
+ # scale
+ a = constant_op.constant(np.random.randn(24, 24, 20), dtype=dtype)
+ f = x + a
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(20), dtype=dtype)
+ f = x + a
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(20), dtype=dtype)
+ f = a + x
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(1, 24, 1, 1), dtype=dtype)
+ f = a + x
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(1, 24, 1, 1), dtype=dtype)
+ f = x + a
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(1, 24, 24, 1), dtype=dtype)
+ f = a + x
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(1, 24, 24, 1), dtype=dtype)
+ f = x + a
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(1, 24, 24, 20), dtype=dtype)
+ f = a + x
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(1, 24, 24, 20), dtype=dtype)
+ f = x + a
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(24, 20), dtype=dtype)
+ f = a + x
+ x = math_ops.sigmoid(f)
+ # elementwise
+ a = constant_op.constant(np.random.randn(24, 20), dtype=dtype)
+ f = x + a
+ x = math_ops.sigmoid(f)
+ gen_array_ops.reshape(x, [5, -1], name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=16,
+ expected_output_dims=(5, 23040),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+ test.main()
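The constants above deliberately alternate between shapes that broadcast per-channel (scale) and shapes that broadcast per-element; the broadcasting itself follows the usual numpy rules. A quick numpy check of two of the shapes exercised, against the [10, 24, 24, 20] input:

import numpy as np

x = np.zeros((10, 24, 24, 20))
# Rank-3 constant: right-aligned against x's trailing dims, so the 1s
# stretch across the last two axes and 24 matches axis 1.
print((x + np.zeros((24, 1, 1))).shape)      # (10, 24, 24, 20)
# Rank-4 constant with an explicit leading 1: broadcast over the batch axis.
print((x + np.zeros((1, 24, 24, 1))).shape)  # (10, 24, 24, 20)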
diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/contrib/tensorrt/test/concatenation_test.py
new file mode 100644
index 0000000000..8c51c45b0a
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/concatenation_test.py
@@ -0,0 +1,83 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.platform import test
+
+
+class ConcatenationTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Testing Concatenation in TF-TRT conversion."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [2, 3, 3, 1]
+ g = ops.Graph()
+ with g.as_default():
+ x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+ # scale
+ a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+ r1 = x / a
+ a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+ r2 = a / x
+ a = constant_op.constant(np.random.randn(1, 3, 1), dtype=dtype)
+ r3 = a + x
+ a = constant_op.constant(np.random.randn(1, 3, 1), dtype=dtype)
+ r4 = x * a
+ a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+ r5 = x - a
+ a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+ r6 = a - x
+ a = constant_op.constant(np.random.randn(3, 1), dtype=dtype)
+ r7 = x - a
+ a = constant_op.constant(np.random.randn(3, 1), dtype=dtype)
+ r8 = a - x
+ a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+ r9 = gen_math_ops.maximum(x, a)
+ a = constant_op.constant(np.random.randn(3, 1), dtype=dtype)
+ r10 = gen_math_ops.minimum(a, x)
+ a = constant_op.constant(np.random.randn(3), dtype=dtype)
+ r11 = x * a
+ a = constant_op.constant(np.random.randn(1), dtype=dtype)
+ r12 = a * x
+ concat1 = array_ops.concat([r1, r2, r3, r4, r5, r6], axis=-1)
+ concat2 = array_ops.concat([r7, r8, r9, r10, r11, r12], axis=3)
+ x = array_ops.concat([concat1, concat2], axis=-1)
+ gen_array_ops.reshape(x, [2, -1], name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=1,
+ expected_output_dims=(2, 126),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
new file mode 100644
index 0000000000..97b29bf05d
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
@@ -0,0 +1,68 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Test for Constant broadcasting in TF-TRT."""
+ dtype = dtypes.float32
+ input_name = 'input'
+ input_dims = [5, 12, 12, 2]
+ g = ops.Graph()
+ with g.as_default():
+ x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+ filt1 = constant_op.constant(
+ 0.3, shape=(3, 3, 2, 1), dtype=dtype, name='filt1')
+ y1 = nn.conv2d(x, filt1, strides=[1, 1, 1, 1], padding='SAME', name='y1')
+ z1 = nn.relu(y1, name='z1')
+ filt2 = constant_op.constant(
+ np.random.randn(9), shape=(3, 3, 1, 1), dtype=dtype, name='filt2')
+ y2 = nn.conv2d(z1, filt2, strides=[1, 1, 1, 1], padding='SAME', name='y2')
+ z2 = nn.relu(y2, name='z')
+ filt3 = constant_op.constant(
+ np.random.randn(3, 3, 1, 1),
+ shape=(3, 3, 1, 1),
+ dtype=dtype,
+ name='filt3')
+ y3 = nn.conv2d(z2, filt3, strides=[1, 1, 1, 1], padding='SAME', name='y3')
+ nn.relu(y3, name='output')
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=1,
+ expected_output_dims=(5, 12, 12, 1),
+ allclose_atol=1.e-02,
+ allclose_rtol=1.e-02)
+
+
+if __name__ == '__main__':
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
new file mode 100644
index 0000000000..3dd95c6f62
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Testing conversion of BatchMatMul in TF-TRT conversion."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [2, 15, 15, 3]
+ g = ops.Graph()
+ with g.as_default():
+ inp = array_ops.placeholder(
+ dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+ with g.device("/GPU:0"):
+ e1 = constant_op.constant(
+ np.random.randn(1, 1, 3, 5), name="kernel_1", dtype=dtype)
+ e2 = constant_op.constant(
+ np.random.randn(1, 1, 5, 10), name="kernel_2", dtype=dtype)
+ conv = nn.conv2d(
+ input=inp,
+ filter=e1,
+ strides=[1, 1, 1, 1],
+ padding="VALID",
+ name="conv")
+ out = nn.conv2d(
+ input=conv,
+ filter=e2,
+ strides=[1, 1, 1, 1],
+ padding="VALID",
+ name="conv_2")
+ array_ops.squeeze(out, name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=1,
+ expected_output_dims=(2, 15, 15, 10),
+ allclose_atol=1.e-02,
+ allclose_rtol=1.e-02)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
new file mode 100644
index 0000000000..734ccf6345
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
@@ -0,0 +1,87 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Test for multi connection neighboring nodes wiring tests in TF-TRT."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [2, 3, 7, 5]
+ g = ops.Graph()
+ with g.as_default():
+ x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+ e = constant_op.constant(
+ np.random.normal(.05, .005, [3, 2, 3, 4]),
+ name="weights",
+ dtype=dtype)
+ conv = nn.conv2d(
+ input=x,
+ filter=e,
+ data_format="NCHW",
+ strides=[1, 1, 1, 1],
+ padding="VALID",
+ name="conv")
+ b = constant_op.constant(
+ np.random.normal(2.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
+ t = conv + b
+
+ b = constant_op.constant(
+ np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
+ q = conv - b
+ edge = math_ops.sigmoid(q)
+
+ b = constant_op.constant(
+ np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
+ d = b + conv
+ edge3 = math_ops.sigmoid(d)
+
+ edge1 = gen_math_ops.tan(conv)
+ t = t - edge1
+ q = q + edge
+ t = t + q
+ t = t + d
+ t = t - edge3
+ array_ops.squeeze(t, name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=2,
+ expected_output_dims=(2, 4, 5, 4),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
new file mode 100644
index 0000000000..50265c0845
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Neighboring node wiring tests in TF-TRT conversion."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [2, 3, 7, 5]
+ g = ops.Graph()
+ with g.as_default():
+ x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+ e = constant_op.constant(
+ np.random.normal(.3, 0.05, [3, 2, 3, 4]), name="weights", dtype=dtype)
+ conv = nn.conv2d(
+ input=x,
+ filter=e,
+ data_format="NCHW",
+ strides=[1, 1, 1, 1],
+ padding="VALID",
+ name="conv")
+ b = constant_op.constant(
+ np.random.normal(1.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
+ t = conv * b
+ e = gen_math_ops.tan(conv)
+ t = t - e
+ array_ops.squeeze(t, name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=2,
+ expected_output_dims=(2, 4, 5, 4),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
deleted file mode 100644
index 035b112254..0000000000
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
+++ /dev/null
@@ -1,353 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Script to test TF-TensorRT integration."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import namedtuple
-import itertools
-import warnings
-import numpy as np
-import six
-
-from tensorflow.contrib import tensorrt as trt
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import importer
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.platform import test
-
-INPUT_NAME = "input"
-OUTPUT_NAME = "output"
-INPUT_DIMS = [100, 24, 24, 2]
-MODE_FP32 = "FP32"
-MODE_FP16 = "FP16"
-MODE_INT8 = "INT8"
-
-if six.PY2:
- to_bytes = lambda s: s
- to_string = lambda s: s
-else:
- to_bytes = lambda s: s.encode("utf-8", errors="surrogateescape")
- to_string = lambda s: s.decode("utf-8")
-
-
-# TODO(aaroey): test graph with different dtypes.
-def GetSingleEngineGraphDef(dtype=dtypes.float32):
- """Create a graph containing single segment."""
- g = ops.Graph()
- with g.as_default():
- inp = array_ops.placeholder(
- dtype=dtype, shape=[None] + INPUT_DIMS[1:], name=INPUT_NAME)
- with g.device("/GPU:0"):
- conv_filter = constant_op.constant(
- [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
- name="weights",
- dtype=dtype)
- conv = nn.conv2d(
- input=inp,
- filter=conv_filter,
- strides=[1, 2, 2, 1],
- padding="SAME",
- name="conv")
- bias = constant_op.constant(
- [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtype)
- added = nn.bias_add(conv, bias, name="bias_add")
- relu = nn.relu(added, "relu")
- identity = array_ops.identity(relu, "identity")
- pool = nn_ops.max_pool(
- identity, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
- array_ops.squeeze(pool, name=OUTPUT_NAME)
- return g.as_graph_def()
-
-
-# TODO(aaroey): test graph with different dtypes.
-def GetMultiEngineGraphDef(dtype=dtypes.float32):
- """Create a graph containing multiple segment."""
- g = ops.Graph()
- with g.as_default():
- inp = array_ops.placeholder(
- dtype=dtype, shape=[None] + INPUT_DIMS[1:], name=INPUT_NAME)
- with g.device("/GPU:0"):
- conv_filter = constant_op.constant(
- [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
- name="weights",
- dtype=dtype)
- conv = nn.conv2d(
- input=inp,
- filter=conv_filter,
- strides=[1, 2, 2, 1],
- padding="SAME",
- name="conv")
- c1 = constant_op.constant(
- np.random.randn(INPUT_DIMS[0], 12, 12, 6), dtype=dtype)
- p = conv * c1
- c2 = constant_op.constant(
- np.random.randn(INPUT_DIMS[0], 12, 12, 6), dtype=dtype)
- q = conv / c2
-
- edge = math_ops.sin(q)
- edge /= edge
- r = edge + edge
-
- p -= edge
- q *= edge
- s = p + q
- s -= r
- array_ops.squeeze(s, name=OUTPUT_NAME)
- return g.as_graph_def()
-
-
-TestGraph = namedtuple("TestGraph",
- ["gdef", "num_expected_engines", "expected_output_dims"])
-
-TEST_GRAPHS = {
- "SingleEngineGraph":
- TestGraph(
- gdef=GetSingleEngineGraphDef(),
- num_expected_engines=1,
- expected_output_dims=(100, 6, 6, 6)),
- "MultiEngineGraph":
- TestGraph(
- gdef=GetMultiEngineGraphDef(),
- num_expected_engines=2,
- expected_output_dims=(100, 12, 12, 6)),
- # TODO(aaroey): add a large complex graph to test.
-}
-
-
-class TfTrtIntegrationTest(test_util.TensorFlowTestCase):
- """Class to test Tensorflow-TensorRT integration."""
-
- def setUp(self):
- """Setup method."""
- super(TfTrtIntegrationTest, self).setUp()
- warnings.simplefilter("always")
- self._input = np.random.random_sample(INPUT_DIMS)
-
- def _GetConfigProto(self,
- use_optimizer,
- precision_mode=None,
- is_dynamic_op=None):
- if use_optimizer:
- rewriter_cfg = rewriter_config_pb2.RewriterConfig()
- rewriter_cfg.optimizers.extend(["constfold", "layout"])
- custom_op = rewriter_cfg.custom_optimizers.add()
- custom_op.name = "TensorRTOptimizer"
- custom_op.parameter_map["minimum_segment_size"].i = 3
- custom_op.parameter_map["max_batch_size"].i = self._input.shape[0]
- custom_op.parameter_map["is_dynamic_op"].b = is_dynamic_op
- custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
- custom_op.parameter_map["precision_mode"].s = to_bytes(precision_mode)
- graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
- else:
- graph_options = config_pb2.GraphOptions()
-
- gpu_options = config_pb2.GPUOptions()
- if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
- gpu_options.per_process_gpu_memory_fraction = 0.50
-
- config = config_pb2.ConfigProto(
- gpu_options=gpu_options, graph_options=graph_options)
- return config
-
- def _RunGraph(self, graph_key, gdef, input_data, config, num_runs=2):
- """Run given graphdef multiple times."""
- g = ops.Graph()
- with g.as_default():
- inp, out = importer.import_graph_def(
- graph_def=gdef, return_elements=[INPUT_NAME, OUTPUT_NAME], name="")
- inp = inp.outputs[0]
- out = out.outputs[0]
- with self.test_session(
- graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
- val = None
- # Defaults to 2 runs to verify result across multiple runs is same.
- for _ in range(num_runs):
- new_val = sess.run(out, {inp: input_data})
- self.assertEqual(TEST_GRAPHS[graph_key].expected_output_dims,
- new_val.shape)
- if val is not None:
- self.assertAllEqual(new_val, val)
- val = new_val
- return val
-
- # Use real data that is representative of the inference dataset
- # for calibration. For this test script it is random data.
- def _RunCalibration(self, graph_key, gdef, input_data, config):
- """Run calibration on given graph."""
- return self._RunGraph(graph_key, gdef, input_data, config, 30)
-
- def _GetTrtGraph(self, gdef, precision_mode, is_dynamic_op):
- """Return trt converted graph."""
- return trt.create_inference_graph(
- input_graph_def=gdef,
- outputs=[OUTPUT_NAME],
- max_batch_size=self._input.shape[0],
- max_workspace_size_bytes=1 << 25,
- precision_mode=precision_mode,
- minimum_segment_size=2,
- is_dynamic_op=is_dynamic_op)
-
- def _VerifyGraphDef(self,
- graph_key,
- gdef,
- precision_mode=None,
- is_calibrated=None,
- dynamic_engine=None):
- num_engines = 0
- for n in gdef.node:
- if n.op == "TRTEngineOp":
- num_engines += 1
- self.assertNotEqual(to_bytes(""), n.attr["serialized_segment"].s)
- self.assertNotEqual(to_bytes(""), n.attr["segment_funcdef_name"].s)
- self.assertEqual(n.attr["precision_mode"].s, to_bytes(precision_mode))
- self.assertEqual(n.attr["static_engine"].b, not dynamic_engine)
- if precision_mode == MODE_INT8 and is_calibrated:
- self.assertNotEqual(to_bytes(""), n.attr["calibration_data"].s)
- else:
- self.assertEqual(to_bytes(""), n.attr["calibration_data"].s)
- if precision_mode is None:
- self.assertEqual(num_engines, 0)
- else:
- self.assertEqual(num_engines,
- TEST_GRAPHS[graph_key].num_expected_engines)
-
- def _RunTest(self, graph_key, use_optimizer, precision_mode,
- dynamic_infer_engine, dynamic_calib_engine):
- assert precision_mode in [MODE_FP32, MODE_FP16, MODE_INT8]
- input_gdef = TEST_GRAPHS[graph_key].gdef
- self._VerifyGraphDef(graph_key, input_gdef)
-
- # Get reference result without running trt.
- config_no_trt = self._GetConfigProto(False)
- print("Running original graph w/o trt, config:\n%s" % str(config_no_trt))
- ref_result = self._RunGraph(graph_key, input_gdef, self._input,
- config_no_trt)
-
- # Run calibration if necessary.
- if precision_mode == MODE_INT8:
-
- calib_config = self._GetConfigProto(use_optimizer, precision_mode,
- dynamic_calib_engine)
- print("Running calibration graph, config:\n%s" % str(calib_config))
- if use_optimizer:
- self.assertTrue(False)
- # TODO(aaroey): uncomment this and get infer_gdef when this mode is
- # supported.
- # result = self._RunCalibration(graph_key, input_gdef, self._input,
- # calib_config)
- else:
- calib_gdef = self._GetTrtGraph(input_gdef, precision_mode,
- dynamic_calib_engine)
- self._VerifyGraphDef(graph_key, calib_gdef, precision_mode, False,
- dynamic_calib_engine)
- result = self._RunCalibration(graph_key, calib_gdef, self._input,
- calib_config)
- infer_gdef = trt.calib_graph_to_infer_graph(calib_gdef)
- self._VerifyGraphDef(graph_key, infer_gdef, precision_mode, True,
- dynamic_calib_engine)
- self.assertAllClose(ref_result, result, rtol=1.e-03)
- else:
- infer_gdef = input_gdef
-
- # Run inference.
- infer_config = self._GetConfigProto(use_optimizer, precision_mode,
- dynamic_infer_engine)
- print("Running final inference graph, config:\n%s" % str(infer_config))
- if use_optimizer:
- result = self._RunGraph(graph_key, infer_gdef, self._input, infer_config)
- else:
- trt_infer_gdef = self._GetTrtGraph(infer_gdef, precision_mode,
- dynamic_infer_engine)
- self._VerifyGraphDef(graph_key, trt_infer_gdef, precision_mode, True,
- dynamic_infer_engine)
- result = self._RunGraph(graph_key, trt_infer_gdef, self._input,
- infer_config)
- self.assertAllClose(ref_result, result, rtol=1.e-03)
-
- def testIdempotence(self):
- # Test that applying tensorrt optimizer or offline conversion tools multiple
- # times to the same graph will result in same graph.
- #
- # TODO(aaroey): currently the conversion is not deterministic, this is
- # mainly because during tensorflow::ConvertGraphDefToGraph(), the graph uses
- # EdgeSet which use a map keyed by Edge*, so the order of input/output edges
- # of a node is nondeterministic, thus the order for segmenter to contract
- # edges is nondeterministic. Need to evaluate whether we should fix this.
- pass
-
-
-def GetTests():
-
- def _GetTest(g, u, p, i, c):
-
- def _Test(self):
- print("Running test with parameters: graph_key=%s, use_optimizer=%s, "
- "precision_mode=%s, dynamic_infer_engine=%s, "
- "dynamic_calib_engine=%s" % (g, u, p, i, c))
- self._RunTest(g, u, p, i, c)
-
- return _Test
-
- use_optimizer_options = [False, True]
- precision_mode_options = [MODE_FP32, MODE_FP16, MODE_INT8]
- dynamic_infer_engine_options = [False, True]
- dynamic_calib_engine_options = [False, True]
- for (graph_key, use_optimizer, precision_mode,
- dynamic_infer_engine, dynamic_calib_engine) in itertools.product(
- TEST_GRAPHS, use_optimizer_options, precision_mode_options,
- dynamic_infer_engine_options, dynamic_calib_engine_options):
- if precision_mode == MODE_INT8:
- if not dynamic_calib_engine and dynamic_infer_engine:
- # TODO(aaroey): test this case, the conversion from static calibration
- # engine to dynamic inference engine should be a noop.
- continue
- if use_optimizer:
- # TODO(aaroey): if use_optimizer is True we need to get the inference
- # graphdef using custom python wrapper class, which is not currently
- # supported yet.
- continue
- if not dynamic_calib_engine:
- # TODO(aaroey): construction of static calibration engine is not
- # supported yet.
- continue
- if dynamic_calib_engine and not dynamic_infer_engine:
- # TODO(aaroey): construction of static inference engine using dynamic
- # calibration engine is not supported yet.
- continue
- else: # In non int8 mode.
- if dynamic_calib_engine:
- # dynamic_calib_engine doesn't affect non-int8 modes, so just let
- # related tests run once on dynamic_calib_engine=False.
- continue
- yield _GetTest(graph_key, use_optimizer, precision_mode,
- dynamic_infer_engine, dynamic_calib_engine)
-
-
-if __name__ == "__main__":
- if trt.is_tensorrt_enabled():
- for index, t in enumerate(GetTests()):
- setattr(TfTrtIntegrationTest, "testTfTRT_" + str(index), t)
- test.main()
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
new file mode 100644
index 0000000000..bb7f5a77f0
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
@@ -0,0 +1,329 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+import itertools
+import warnings
+import numpy as np
+import six
+
+from tensorflow.contrib.tensorrt.python import trt_convert
+# pylint: disable=unused-import
+from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+# pylint: enable=unused-import
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+
+TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [
+ "gdef", "input_names", "input_dims", "num_expected_engines",
+ "expected_output_dims", "allclose_atol", "allclose_rtol"
+])
+
+PRECISION_MODES = ["FP32", "FP16", "INT8"]
+
+
+def _IsQuantizationMode(mode):
+ return mode == "INT8"
+
+
+class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
+ """Class to test Tensorflow-TensorRT integration."""
+
+ @property
+ def output_name(self):
+ return "output"
+
+ @property
+ def trt_incompatible_op(self):
+ return math_ops.sin
+
+ @property
+ def precision_modes(self):
+ return ["FP32", "FP16", "INT8"]
+
+ def _ToBytes(self, s):
+ if six.PY2:
+ return s
+ else:
+ return s.encode("utf-8")
+
+ def _ToString(self, s):
+ if six.PY2:
+ return s
+ else:
+ return s.decode("utf-8")
+
+ def setUp(self):
+ """Setup method."""
+ super(TfTrtIntegrationTestBase, self).setUp()
+ warnings.simplefilter("always")
+
+ def GetParams(self):
+ """Return a TfTrtIntegrationTestParams for test, implemented by subclass."""
+ raise NotImplementedError()
+
+ def _GetConfigProto(self,
+ params,
+ use_optimizer,
+ precision_mode=None,
+ is_dynamic_op=None):
+ """Get config proto based on specific settings."""
+ if use_optimizer:
+ rewriter_cfg = rewriter_config_pb2.RewriterConfig()
+ rewriter_cfg.optimizers.extend(["constfold", "layout"])
+ custom_op = rewriter_cfg.custom_optimizers.add()
+ custom_op.name = "TensorRTOptimizer"
+ custom_op.parameter_map["minimum_segment_size"].i = 3
+ custom_op.parameter_map["max_batch_size"].i = max(
+ [dims[0] for dims in params.input_dims])
+ custom_op.parameter_map["is_dynamic_op"].b = is_dynamic_op
+ custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
+ custom_op.parameter_map["precision_mode"].s = self._ToBytes(
+ precision_mode)
+ graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
+ else:
+ graph_options = config_pb2.GraphOptions()
+
+ gpu_options = config_pb2.GPUOptions()
+ gpu_options.allow_growth = True
+ if trt_convert.get_linked_tensorrt_version()[0] == 3:
+ gpu_options.per_process_gpu_memory_fraction = 0.50
+
+ config = config_pb2.ConfigProto(
+ gpu_options=gpu_options, graph_options=graph_options)
+ return config
+
+ def _RunGraph(self, params, gdef, input_data, config, num_runs=2):
+ """Run given graphdef multiple times."""
+ assert len(params.input_names) == len(input_data)
+ g = ops.Graph()
+ with g.as_default():
+ io_ops = importer.import_graph_def(
+ graph_def=gdef,
+ return_elements=params.input_names + [self.output_name],
+ name="")
+ inp = [i.outputs[0] for i in io_ops[:-1]]
+ assert len(inp) == len(input_data)
+ out = io_ops[-1].outputs[0]
+ with self.test_session(
+ graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
+ val = None
+      # Defaults to 2 runs to verify that results are consistent across runs.
+ for _ in range(num_runs):
+ new_val = sess.run(out,
+ {inp[i]: input_data[i] for i in range(len(inp))})
+ self.assertEqual(params.expected_output_dims, new_val.shape)
+ if val is not None:
+ self.assertAllEqual(val, new_val)
+ val = new_val
+ return val
+
+ # Use real data that is representative of the inference dataset
+ # for calibration. For this test script it is random data.
+ def _RunCalibration(self, params, gdef, input_data, config):
+ """Run calibration on given graph."""
+ return self._RunGraph(params, gdef, input_data, config, 30)
+
+ def _GetTrtGraphDef(self, params, gdef, precision_mode, is_dynamic_op):
+ """Return trt converted graphdef."""
+ return trt_convert.create_inference_graph(
+ input_graph_def=gdef,
+ outputs=[self.output_name],
+ max_batch_size=max([dims[0] for dims in params.input_dims]),
+ max_workspace_size_bytes=1 << 25,
+ precision_mode=precision_mode,
+ minimum_segment_size=2,
+ is_dynamic_op=is_dynamic_op)
+
+ def _VerifyGraphDef(self,
+ params,
+ gdef,
+ precision_mode=None,
+ is_calibrated=None,
+ dynamic_engine=None):
+ num_engines = 0
+ for n in gdef.node:
+      # TODO(jie): we should have coverage for failed conversions (TF
+      # fallback), where the conversion fails and the engine should not be
+      # counted as converted.
+ if n.op == "TRTEngineOp":
+ num_engines += 1
+ self.assertNotEqual(self._ToBytes(""), n.attr["serialized_segment"].s)
+ self.assertNotEqual(self._ToBytes(""), n.attr["segment_funcdef_name"].s)
+ self.assertEqual(
+ self._ToBytes(precision_mode), n.attr["precision_mode"].s)
+ self.assertEqual(not dynamic_engine, n.attr["static_engine"].b)
+ if _IsQuantizationMode(precision_mode) and is_calibrated:
+ self.assertNotEqual(self._ToBytes(""), n.attr["calibration_data"].s)
+ else:
+ self.assertEqual(self._ToBytes(""), n.attr["calibration_data"].s)
+ if precision_mode is None: # This means gdef is the original GraphDef.
+ self.assertEqual(0, num_engines)
+ else:
+ self.assertEqual(num_engines, params.num_expected_engines)
+
+ def RunTest(self, params, use_optimizer, precision_mode,
+ dynamic_infer_engine, dynamic_calib_engine):
+ assert precision_mode in PRECISION_MODES
+ input_data = [np.random.random_sample(dims) for dims in params.input_dims]
+ input_gdef = params.gdef
+ self._VerifyGraphDef(params, input_gdef)
+
+ # Get reference result without running trt.
+ config_no_trt = self._GetConfigProto(params, False)
+ logging.info("Running original graph w/o trt, config:\n%s",
+ str(config_no_trt))
+ ref_result = self._RunGraph(params, input_gdef, input_data, config_no_trt)
+
+ # Run calibration if necessary.
+ if _IsQuantizationMode(precision_mode):
+
+ calib_config = self._GetConfigProto(params, use_optimizer, precision_mode,
+ dynamic_calib_engine)
+ logging.info("Running calibration graph, config:\n%s", str(calib_config))
+ if use_optimizer:
+ self.assertTrue(False)
+ # TODO(aaroey): uncomment this and get infer_gdef when this mode is
+ # supported.
+ # result = self._RunCalibration(params, input_gdef, input_data,
+ # calib_config)
+ else:
+ calib_gdef = self._GetTrtGraphDef(params, input_gdef, precision_mode,
+ dynamic_calib_engine)
+ self._VerifyGraphDef(params, calib_gdef, precision_mode, False,
+ dynamic_calib_engine)
+ result = self._RunCalibration(params, calib_gdef, input_data,
+ calib_config)
+ infer_gdef = trt_convert.calib_graph_to_infer_graph(calib_gdef)
+ self._VerifyGraphDef(params, infer_gdef, precision_mode, True,
+ dynamic_calib_engine)
+
+ self.assertAllClose(
+ ref_result,
+ result,
+ atol=params.allclose_atol,
+ rtol=params.allclose_rtol)
+ else:
+ infer_gdef = input_gdef
+
+ # Run inference.
+ infer_config = self._GetConfigProto(params, use_optimizer, precision_mode,
+ dynamic_infer_engine)
+ logging.info("Running final inference graph, config:\n%s",
+ str(infer_config))
+ if use_optimizer:
+ result = self._RunGraph(params, infer_gdef, input_data, infer_config)
+ else:
+ trt_infer_gdef = self._GetTrtGraphDef(params, infer_gdef, precision_mode,
+ dynamic_infer_engine)
+ self._VerifyGraphDef(params, trt_infer_gdef, precision_mode, True,
+ dynamic_infer_engine)
+ result = self._RunGraph(params, trt_infer_gdef, input_data, infer_config)
+
+ self.assertAllClose(
+ ref_result,
+ result,
+ atol=params.allclose_atol,
+ rtol=params.allclose_rtol)
+
+ def testIdempotence(self):
+    # Test that applying the TensorRT optimizer or the offline conversion
+    # tools multiple times to the same graph results in the same graph.
+    #
+    # TODO(aaroey): the conversion is currently not deterministic. This is
+    # mainly because in tensorflow::ConvertGraphDefToGraph() the graph uses
+    # EdgeSet, which is backed by a map keyed by Edge*, so the order of a
+    # node's input/output edges is nondeterministic, and hence so is the
+    # order in which the segmenter contracts edges. Need to evaluate whether
+    # we should fix this.
+ pass
+
+
+def _AddTests(test_class):
+ """Adds test methods to TfTrtIntegrationTestBase."""
+
+ def _GetTest(use_optimizer, precision_mode, dynamic_infer_engine,
+ dynamic_calib_engine):
+ """Gets a single test method based on the parameters."""
+
+ def _Test(self):
+ params = self.GetParams()
+ logging.info(
+ "Running test with parameters: use_optimizer=%s, precision_mode=%s, "
+ "dynamic_infer_engine=%s, dynamic_calib_engine=%s", use_optimizer,
+ precision_mode, dynamic_infer_engine, dynamic_calib_engine)
+ self.RunTest(params, use_optimizer, precision_mode, dynamic_infer_engine,
+ dynamic_calib_engine)
+
+ return _Test
+
+ use_optimizer_options = [False, True]
+ dynamic_infer_engine_options = [False, True]
+ dynamic_calib_engine_options = [False, True]
+ for (use_optimizer, precision_mode,
+ dynamic_infer_engine, dynamic_calib_engine) in itertools.product(
+ use_optimizer_options, PRECISION_MODES, dynamic_infer_engine_options,
+ dynamic_calib_engine_options):
+ if _IsQuantizationMode(precision_mode):
+ if not dynamic_calib_engine and dynamic_infer_engine:
+        # TODO(aaroey): test this case; the conversion from a static
+        # calibration engine to a dynamic inference engine should be a no-op.
+ continue
+ if use_optimizer:
+        # TODO(aaroey): if use_optimizer is True we need to get the inference
+        # graphdef using a custom python wrapper class, which is not
+        # supported yet.
+ continue
+ if not dynamic_calib_engine:
+        # TODO(aaroey): construction of a static calibration engine is not
+        # supported yet.
+ continue
+ if dynamic_calib_engine and not dynamic_infer_engine:
+        # TODO(aaroey): construction of a static inference engine from a
+        # dynamic calibration engine is not supported yet.
+ continue
+    else:  # In non-INT8 mode.
+ if dynamic_calib_engine:
+        # dynamic_calib_engine doesn't affect non-INT8 modes, so just let the
+        # related tests run once with dynamic_calib_engine=False.
+ continue
+
+ conversion = "OptimizerConversion" if use_optimizer else "ToolConversion"
+ infer_engine_type = ("DynamicInferEngine"
+ if dynamic_infer_engine else "StaticInferEngine")
+ calib_engine_type = ""
+ if precision_mode == "INT8":
+ calib_engine_type = ("DynamicCalibEngine"
+ if dynamic_calib_engine else "StaticCalibEngine")
+ test_name = "%s_%s_%s%s" % (conversion, precision_mode, infer_engine_type,
+ ("_" + calib_engine_type)
+ if len(calib_engine_type) else "")
+ setattr(
+ test_class, "testTfTRT_" + test_name,
+ _GetTest(use_optimizer, precision_mode, dynamic_infer_engine,
+ dynamic_calib_engine))
+
+
+if trt_convert.is_tensorrt_enabled():
+ _AddTests(TfTrtIntegrationTestBase)
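For reference, a minimal test built on the new base class might look like the
following sketch. The graph contents, dimensions, and expected engine count
are illustrative assumptions, not part of this commit.

    from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
    from tensorflow.python.framework import dtypes
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import math_ops
    from tensorflow.python.platform import test


    class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):

      def GetParams(self):
        """Builds a trivial two-op graph and packages it as test params."""
        input_name = "input"
        input_dims = [2, 8, 8, 3]
        g = ops.Graph()
        with g.as_default():
          x = array_ops.placeholder(
              dtype=dtypes.float32, shape=input_dims, name=input_name)
          q = math_ops.abs(x) + 1.0
          # The output op must be named self.output_name ("output") so the
          # base class can locate it in _RunGraph and _GetTrtGraphDef.
          array_ops.squeeze(q, name=self.output_name)
        return trt_test.TfTrtIntegrationTestParams(
            gdef=g.as_graph_def(),
            input_names=[input_name],
            input_dims=[input_dims],
            num_expected_engines=1,  # assumed: abs+add form one segment
            expected_output_dims=(2, 8, 8, 3),
            allclose_atol=1.e-03,
            allclose_rtol=1.e-03)


    if __name__ == "__main__":
      test.main()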
diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/contrib/tensorrt/test/unary_test.py
new file mode 100644
index 0000000000..b9e977cf67
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/unary_test.py
@@ -0,0 +1,110 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class UnaryTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Test for unary operations in TF-TRT."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [12, 5, 8, 1, 1, 12]
+ input2_name = "input_2"
+ input2_dims = [12, 5, 8, 1, 12, 1, 1]
+ g = ops.Graph()
+ with g.as_default():
+ x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+ q = math_ops.abs(x)
+ q = q + 1.0
+ q = gen_math_ops.exp(q)
+ q = gen_math_ops.log(q)
+ q = array_ops.squeeze(q, axis=-2)
+ q = math_ops.abs(q)
+ q = q + 2.2
+ q = gen_math_ops.sqrt(q)
+ q = gen_math_ops.rsqrt(q)
+ q = math_ops.negative(q)
+ q = array_ops.squeeze(q, axis=3)
+ q = math_ops.abs(q)
+ q = q + 3.0
+ a = gen_math_ops.reciprocal(q)
+
+ x = constant_op.constant(np.random.randn(5, 8, 12), dtype=dtype)
+ q = math_ops.abs(x)
+ q = q + 2.0
+ q = gen_math_ops.exp(q)
+ q = gen_math_ops.log(q)
+ q = math_ops.abs(q)
+ q = q + 2.1
+ q = gen_math_ops.sqrt(q)
+ q = gen_math_ops.rsqrt(q)
+ q = math_ops.negative(q)
+ q = math_ops.abs(q)
+ q = q + 4.0
+ b = gen_math_ops.reciprocal(q)
+
+      # TODO(jie): this one will break since it broadcasts along the batch
+      # dimension.
+ x = array_ops.placeholder(
+ dtype=dtype, shape=input2_dims, name=input2_name)
+ q = math_ops.abs(x)
+ q = q + 5.0
+ q = gen_math_ops.exp(q)
+ q = array_ops.squeeze(q, axis=[-1, -2, 3])
+ q = gen_math_ops.log(q)
+ q = math_ops.abs(q)
+ q = q + 5.1
+ q = gen_array_ops.reshape(q, [12, 5, 1, 1, 8, 1, 12])
+ q = array_ops.squeeze(q, axis=[5, 2, 3])
+ q = gen_math_ops.sqrt(q)
+ q = math_ops.abs(q)
+ q = q + 5.2
+ q = gen_math_ops.rsqrt(q)
+ q = math_ops.negative(q)
+ q = math_ops.abs(q)
+ q = q + 5.3
+ c = gen_math_ops.reciprocal(q)
+
+ q = a * b
+ q = q / c
+ array_ops.squeeze(q, name=self.output_name)
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name, input2_name],
+ input_dims=[input_dims, input2_dims],
+ num_expected_engines=5,
+ expected_output_dims=(12, 5, 8, 12),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
new file mode 100644
index 0000000000..2b134c3bce
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Single vgg layer in NCHW unit tests in TF-TRT."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [5, 2, 8, 8]
+ g = ops.Graph()
+ with g.as_default():
+ x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+ x, _, _ = nn_impl.fused_batch_norm(
+ x,
+ np.random.randn(2).astype(np.float32),
+ np.random.randn(2).astype(np.float32),
+ mean=np.random.randn(2).astype(np.float32),
+ variance=np.random.randn(2).astype(np.float32),
+ data_format="NCHW",
+ is_training=False)
+ e = constant_op.constant(
+ np.random.randn(1, 1, 2, 6), name="weights", dtype=dtype)
+ conv = nn.conv2d(
+ input=x,
+ filter=e,
+ data_format="NCHW",
+ strides=[1, 1, 2, 2],
+ padding="SAME",
+ name="conv")
+ b = constant_op.constant(np.random.randn(6), name="bias", dtype=dtype)
+ t = nn.bias_add(conv, b, data_format="NCHW", name="biasAdd")
+ relu = nn.relu(t, "relu")
+ idty = array_ops.identity(relu, "ID")
+ v = nn_ops.max_pool(
+ idty, [1, 1, 2, 2], [1, 1, 2, 2],
+ "VALID",
+ data_format="NCHW",
+ name="max_pool")
+ array_ops.squeeze(v, name="output")
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=1,
+ expected_output_dims=(5, 6, 2, 2),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
new file mode 100644
index 0000000000..bec2f23eff
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
@@ -0,0 +1,73 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class VGGBlockTest(trt_test.TfTrtIntegrationTestBase):
+
+ def GetParams(self):
+ """Single vgg layer test in TF-TRT conversion."""
+ dtype = dtypes.float32
+ input_name = "input"
+ input_dims = [5, 8, 8, 2]
+ g = ops.Graph()
+ with g.as_default():
+ x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+ x, _, _ = nn_impl.fused_batch_norm(
+ x,
+ np.random.randn(2).astype(np.float32),
+ np.random.randn(2).astype(np.float32),
+ mean=np.random.randn(2).astype(np.float32),
+ variance=np.random.randn(2).astype(np.float32),
+ is_training=False)
+ e = constant_op.constant(
+ np.random.randn(1, 1, 2, 6), name="weights", dtype=dtype)
+ conv = nn.conv2d(
+ input=x, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
+ b = constant_op.constant(np.random.randn(6), name="bias", dtype=dtype)
+ t = nn.bias_add(conv, b, name="biasAdd")
+ relu = nn.relu(t, "relu")
+ idty = array_ops.identity(relu, "ID")
+ v = nn_ops.max_pool(
+ idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+ array_ops.squeeze(v, name="output")
+ return trt_test.TfTrtIntegrationTestParams(
+ gdef=g.as_graph_def(),
+ input_names=[input_name],
+ input_dims=[input_dims],
+ num_expected_engines=1,
+ expected_output_dims=(5, 2, 2, 6),
+ allclose_atol=1.e-03,
+ allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index ef6c752851..5a7825f29a 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -15,6 +15,7 @@ package(
default_visibility = [
"//cloud/vmm/testing/tests/tpu:__subpackages__",
"//learning/brain:__subpackages__",
+ "//learning/deepmind:__subpackages__",
"//tensorflow:__subpackages__",
],
)
@@ -36,6 +37,7 @@ cc_library(
py_library(
name = "tpu_estimator",
srcs = [
+ "python/tpu/error_handling.py",
"python/tpu/tpu_config.py",
"python/tpu/tpu_context.py",
"python/tpu/tpu_estimator.py",
@@ -165,6 +167,17 @@ py_library(
"python/tpu/keras_support.py",
],
srcs_version = "PY2AND3",
+ visibility = [
+ "//cloud/vmm/testing/tests/tpu:__subpackages__",
+ "//learning/brain:__subpackages__",
+ # TODO(b/111651964): Clean special visibility for keras_support.
+ #
+ # Note: If you are an end user, please do not add your project to this
+ # visibility. This feature is experimental, and will be made public
+ # when ready.
+ "//third_party/cloud_tpu/models/keras:__subpackages__",
+ "//tensorflow:__subpackages__",
+ ],
deps = [
":tpu_lib",
":tpu_py",
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index f0fca63db0..da4a95e045 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -11,6 +11,9 @@ service TPUProfiler {
// Starts a profiling session, blocks until it completes, and returns data.
rpc Profile(ProfileRequest) returns (ProfileResponse) {
}
+ // Collects profiling data and returns user-friendly metrics.
+ rpc Monitor(MonitorRequest) returns (MonitorResponse) {
+ }
}
message ProfileOptions {
@@ -104,3 +107,26 @@ message ProfileResponse {
// next-field: 8
}
+
+message MonitorRequest {
+ // Duration for which to profile between each update.
+ uint64 duration_ms = 1;
+
+ // Indicates the level at which we want to monitor. Currently, two levels are
+ // supported:
+  // Level 1: An ultra-lightweight mode that captures only some utilization
+  // metrics.
+  // Level 2: More verbose than level 1. Collects utilization metrics, device
+  // information, step time information, etc. Do not use this option if the
+  // TPU host is already heavily loaded.
+ int32 monitoring_level = 2;
+
+ // next-field: 3
+}
+
+message MonitorResponse {
+  // Properly formatted string data that can be returned directly to the user.
+ string data = 1;
+
+ // next-field: 2
+}
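The new Monitor RPC can be exercised from any gRPC client generated from this
proto. A minimal client sketch, assuming conventional protoc/gRPC codegen for
the module names and a hypothetical worker address:

    import grpc

    from tensorflow.contrib.tpu.profiler import tpu_profiler_pb2
    from tensorflow.contrib.tpu.profiler import tpu_profiler_pb2_grpc

    # Hypothetical profiler service address; not part of this commit.
    channel = grpc.insecure_channel("tpu-worker:8466")
    stub = tpu_profiler_pb2_grpc.TPUProfilerStub(channel)
    request = tpu_profiler_pb2.MonitorRequest(
        duration_ms=1000,    # profile for one second per update
        monitoring_level=1)  # level 1: lightweight utilization metrics only
    response = stub.Monitor(request)
    print(response.data)  # pre-formatted, user-facing metrics string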
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index 9150606f5e..2cc17d6d92 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -1,10 +1,12 @@
-syntax = "proto2";
+syntax = "proto3";
package tensorflow.tpu;
+import "google/protobuf/wrappers.proto";
+
message ClippingLimits {
- optional float lower = 1 [default = -inf];
- optional float upper = 2 [default = inf];
+ google.protobuf.FloatValue lower = 1; // -inf if not set
+ google.protobuf.FloatValue upper = 2; // +inf if not set
}
// Get the learning rate from a <yet to be determined> source that can change
@@ -21,18 +23,18 @@ message LearningRate {
}
message AdagradParameters {
- optional float initial_accumulator = 1 [default = 0.];
+ float initial_accumulator = 1;
}
message StochasticGradientDescentParameters {
}
message FtrlParameters {
- optional float l1 = 1 [default = 0.];
- optional float l2 = 2 [default = 0.];
- optional float lr_power = 3 [default = 0.];
- optional float initial_accum = 4 [default = 0.];
- optional float initial_linear = 5 [default = 0.];
+ float l1 = 1;
+ float l2 = 2;
+ float lr_power = 3;
+ float initial_accum = 4;
+ float initial_linear = 5;
}
// The Adam optimizer does not implement hyper-parameter update; use the dynamic
@@ -41,84 +43,84 @@ message FtrlParameters {
// Here, t is the current timestep.
// https://github.com/tensorflow/tensorflow/blob/ab51450c817674c8ff08a7ae4f8ac50cdc4bed8b/tensorflow/python/training/adam.py#L54
message AdamParameters {
- optional float beta1 = 3 [default = 0.];
- optional float beta2 = 4 [default = 0.];
- optional float epsilon = 5 [default = 0.];
- optional float initial_m = 6 [default = 0.];
- optional float initial_v = 7 [default = 0.];
+ float beta1 = 3;
+ float beta2 = 4;
+ float epsilon = 5;
+ float initial_m = 6;
+ float initial_v = 7;
}
message MomentumParameters {
- optional float momentum = 1 [default = 0.];
- optional bool use_nesterov = 2 [default = false];
- optional float initial_accum = 3 [default = 0.];
+ float momentum = 1;
+ bool use_nesterov = 2;
+ float initial_accum = 3;
}
message RmsPropParameters {
- optional float rho = 1 [default = 0.];
- optional float momentum = 2 [default = 0.];
- optional float epsilon = 3 [default = 0.];
- optional float initial_ms = 4 [default = 0.];
- optional float initial_mom = 5 [default = 0.];
+ float rho = 1;
+ float momentum = 2;
+ float epsilon = 3;
+ float initial_ms = 4;
+ float initial_mom = 5;
}
message CenteredRmsPropParameters {
- optional float rho = 1 [default = 0.];
- optional float momentum = 2 [default = 0.];
- optional float epsilon = 3 [default = 0.];
- optional float initial_ms = 4 [default = 0.];
- optional float initial_mom = 5 [default = 0.];
- optional float initial_mg = 6 [default = 0.];
+ float rho = 1;
+ float momentum = 2;
+ float epsilon = 3;
+ float initial_ms = 4;
+ float initial_mom = 5;
+ float initial_mg = 6;
}
message MdlAdagradLightParameters {
- optional float l2 = 1;
- optional float lr_power = 2;
- optional float min_servable_mdl_benefit = 3;
- optional float mdl_mix_in_margin = 4;
- optional float mdl_benefit_rampup_coeff = 5;
- optional float mdl_min_weight = 6;
- optional float benefit_revisit_scale = 7;
- optional float max_event_benefit = 8;
- optional float max_total_benefit = 9;
- optional float mdl_hard_limit = 10;
- optional bool hard_limit_min_benefit = 11;
- optional bool mdl_regularize = 12;
- optional float initial_accumulator = 13;
- optional float initial_weight = 14;
- optional float initial_benefit = 15;
+ float l2 = 1;
+ float lr_power = 2;
+ float min_servable_mdl_benefit = 3;
+ float mdl_mix_in_margin = 4;
+ float mdl_benefit_rampup_coeff = 5;
+ float mdl_min_weight = 6;
+ float benefit_revisit_scale = 7;
+ float max_event_benefit = 8;
+ float max_total_benefit = 9;
+ float mdl_hard_limit = 10;
+ bool hard_limit_min_benefit = 11;
+ bool mdl_regularize = 12;
+ float initial_accumulator = 13;
+ float initial_weight = 14;
+ float initial_benefit = 15;
}
message AdadeltaParameters {
- optional float rho = 1;
- optional float epsilon = 2;
- optional float initial_accumulator = 3 [default = 0.];
- optional float initial_update = 4 [default = 0.];
+ float rho = 1;
+ float epsilon = 2;
+ float initial_accumulator = 3;
+ float initial_update = 4;
}
message ProximalAdagradParameters {
- optional float l1 = 1;
- optional float l2 = 2;
- optional float initial_accumulator = 3;
+ float l1 = 1;
+ float l2 = 2;
+ float initial_accumulator = 3;
}
message OptimizationParameters {
// Learning rate used for updating the embedding layer parameters.
- optional LearningRate learning_rate = 13;
+ LearningRate learning_rate = 13;
reserved 1; // Old learning rate tag.
// Limits to which to clip the weight values after the backward pass; not
// present means no limits are applied.
- optional ClippingLimits clipping_limits = 2;
+ ClippingLimits clipping_limits = 2;
// Limits to which to clip the backward pass gradient before using it for
// updates; not present means no limits are applied.
- optional ClippingLimits gradient_clipping_limits = 7;
+ ClippingLimits gradient_clipping_limits = 7;
// Whether to use gradient accumulation (do two passes over the input
// gradients: one to accumulate them into a temporary array and another to
// apply them using the actual optimization algorithm).
- optional bool use_gradient_accumulation = 15 [default = false];
+ bool use_gradient_accumulation = 15;
// Optimization algorithm parameters; which field is selected determines which
// algorithm to use.
@@ -140,7 +142,7 @@ message OptimizationParameters {
// value vector and any extra accumulators, etc.).
message StateVariableSpecification {
// Parameter name for the state variable.
- optional string name = 1;
+ string name = 1;
// A normal state variable that should be saved and restored in checkpoints
// and used as an input or output to non-debug TensorFlow ops.
@@ -151,7 +153,7 @@ message StateVariableSpecification {
// from users (used for intermediate gradients being accumulated, for
// example).
message FillWithConstant {
- optional double initial_value = 1;
+ double initial_value = 1;
}
// Usage type of this state variable.
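Because proto3 drops explicit presence for scalar fields, the FloatValue
wrappers are what preserve the "-inf/+inf if not set" semantics noted above.
A sketch of how a consumer might recover those defaults, assuming the
conventional generated module name:

    from tensorflow.contrib.tpu.proto import optimization_parameters_pb2

    limits = optimization_parameters_pb2.ClippingLimits()
    limits.upper.value = 1.0  # wrapper messages track presence

    # Unlike plain proto3 floats, wrapper fields support HasField(), so an
    # unset bound can still be distinguished from an explicit 0.0.
    lower = limits.lower.value if limits.HasField("lower") else float("-inf")
    upper = limits.upper.value if limits.HasField("upper") else float("inf")
    assert (lower, upper) == (float("-inf"), 1.0)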
diff --git a/tensorflow/contrib/tpu/python/tpu/error_handling.py b/tensorflow/contrib/tpu/python/tpu/error_handling.py
new file mode 100644
index 0000000000..52e1ea4237
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/error_handling.py
@@ -0,0 +1,132 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ErrorRendezvous handler for collecting errors from multiple threads."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import sys
+import threading
+import time
+
+import six
+
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import tf_logging as logging
+
+_UNINTERESTING_ERRORS = (errors.CancelledError,)
+
+
+class ErrorRendezvous(object):
+ """Resolve errors from multiple threads during TPU execution.
+
+ TPU errors can occur on the infeed or outfeed threads as well as the main
+ training thread.
+
+ Depending on which thread "wins" and receives the session error first, we may
+ end up showing users a confusing and non-actionable error message (session
+ cancelled) instead of a root cause (e.g. a bad filename).
+
+ The rendezvous object provides a location to capture these errors until all
+ threads terminate. At that point we can choose the most informative error
+ to report.
+ """
+
+ def __init__(self, num_sources):
+ # string -> (message, traceback)
+ self._errors = {}
+ self._num_sources = num_sources
+ self._session_cancel_timer = None
+
+ def record_error(self, source, exc_info, session=None):
+ """Report an exception from the given source.
+
+ If a session is passed, a timer will be registered to close it after a few
+ seconds. This is necessary to ensure the main training loop does not hang
+    if an infeed/outfeed error occurs. We sleep a few seconds to allow a more
+ interesting error from another thread to propagate.
+
+ Args:
+ source: string, source of the error
+ exc_info: Output from `sys.exc_info` (type, value, traceback)
+ session: Session to close after delay.
+ """
+ _, value, _ = exc_info
+ self._errors[source] = exc_info
+ logging.info('Error recorded from %s: %s', source, value)
+
+ if session is not None and self._session_cancel_timer is None:
+
+ def _cancel_session():
+ time.sleep(5)
+ try:
+ session.close()
+ except: # pylint: disable=bare-except
+ pass
+
+ self._session_cancel_timer = threading.Thread(target=_cancel_session,)
+ self._session_cancel_timer.daemon = True
+ self._session_cancel_timer.start()
+
+ def record_done(self, source):
+ """Mark execution source `source` as done.
+
+ If an error was originally reported from `source` it is left intact.
+
+ Args:
+ source: `str`, source being recorded
+ """
+ logging.info('%s marked as finished', source)
+ if source not in self._errors:
+ self._errors[source] = None
+
+ @contextlib.contextmanager
+ def catch_errors(self, source, session=None):
+ """Context manager to report any errors within a block."""
+ try:
+ yield
+ except Exception: # pylint: disable=broad-except
+ self.record_error(source, sys.exc_info(), session)
+
+ def raise_errors(self, timeout_sec=0):
+ """Wait for up to `timeout` seconds for all error sources to finish.
+
+ Preferentially raise "interesting" errors (errors not in the
+ _UNINTERESTING_ERRORS) set.
+
+ Args:
+ timeout_sec: Seconds to wait for other error sources.
+ """
+ for _ in range(timeout_sec):
+ if len(self._errors) == self._num_sources:
+ break
+ time.sleep(1)
+
+ kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None]
+
+ # First check for any interesting errors, then fall back on the session
+ # cancelled errors etc.
+ for k, (typ, value, traceback) in kept_errors:
+ if isinstance(value, _UNINTERESTING_ERRORS):
+ continue
+ else:
+ logging.warn('Reraising captured error')
+ six.reraise(typ, value, traceback)
+
+ for k, (typ, value, traceback) in kept_errors:
+ logging.warn('Reraising captured error')
+ six.reraise(typ, value, traceback)
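A minimal usage sketch of the rendezvous pattern above; the thread wiring and
the simulated infeed failure are illustrative, not taken from this commit:

    import threading

    from tensorflow.contrib.tpu.python.tpu import error_handling

    rendezvous = error_handling.ErrorRendezvous(num_sources=2)

    def _infeed_loop():
      # Any exception inside the block is captured instead of propagating.
      with rendezvous.catch_errors(source='infeed'):
        raise ValueError('bad input file')  # stands in for a real failure
      rendezvous.record_done('infeed')

    def _outfeed_loop():
      with rendezvous.catch_errors(source='outfeed'):
        pass  # outfeed finishes cleanly
      rendezvous.record_done('outfeed')

    threads = [threading.Thread(target=_infeed_loop),
               threading.Thread(target=_outfeed_loop)]
    for t in threads:
      t.start()
    for t in threads:
      t.join()

    # Re-raises the captured ValueError: it is "interesting" (not a
    # CancelledError), so it wins over any session-cancellation noise.
    rendezvous.raise_errors(timeout_sec=5)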
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index 8292c920fc..81798ee423 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -441,21 +441,23 @@ class TPUNumpyInfeedManager(TPUInfeedManager):
shard_infeed_tensors = []
for shard_id in range(self._strategy.num_towers):
- with ops.device('/device:TPU:%d' % shard_id):
+ with ops.device('/device:CPU:0'):
infeed_tensors = []
- for spec in input_specs:
- # Construct placeholders for each of the inputs.
- infeed_tensors.append(
- array_ops.placeholder(
- dtype=spec.dtype,
- shape=spec.shape,
- name='infeed-enqueue-%s-%d' % (spec.name, shard_id)))
+ with ops.device('/device:TPU:%d' % shard_id):
+ for spec in input_specs:
+ # Construct placeholders for each of the inputs.
+ infeed_tensors.append(
+ array_ops.placeholder(
+ dtype=spec.dtype,
+ shape=spec.shape,
+ name='infeed-enqueue-%s-%d' % (spec.name, shard_id)))
shard_infeed_tensors.append(infeed_tensors)
infeed_op.append(
tpu_ops.infeed_enqueue_tuple(
infeed_tensors, [spec.shape for spec in input_specs],
- name='infeed-enqueue-%s-%d' % (execution_mode, shard_id)))
+ name='infeed-enqueue-%s-%d' % (execution_mode, shard_id),
+ device_ordinal=shard_id))
return SizedInfeed(infeed_ops=infeed_op,
sharded_infeed_tensors=shard_infeed_tensors)
@@ -584,12 +586,13 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
assert len(shard_infeed_tensors) == self._strategy.num_towers
infeed_ops = []
for shard_id in range(self._strategy.num_towers):
- with ops.device('/device:TPU:%d' % shard_id):
+ with ops.device('/device:CPU:0'):
infeed_ops.append(
tpu_ops.infeed_enqueue_tuple(
shard_infeed_tensors[shard_id],
[spec.shape for spec in input_specs],
- name='infeed-enqueue-%s-%d' % (execution_mode, shard_id)))
+ name='infeed-enqueue-%s-%d' % (execution_mode, shard_id),
+ device_ordinal=shard_id))
return SizedInfeed(infeed_ops=infeed_ops,
sharded_infeed_tensors=shard_infeed_tensors)
@@ -740,12 +743,13 @@ class TPUFunction(object):
# Build output ops.
outfeed_op = []
for shard_id in range(self._strategy.num_towers):
- with ops.device('/device:TPU:%d' % shard_id):
+ with ops.device('/device:CPU:0'):
outfeed_op.extend(
tpu_ops.outfeed_dequeue_tuple(
dtypes=[spec.dtype for spec in self._outfeed_spec],
shapes=[spec.shape for spec in self._outfeed_spec],
- name='outfeed-dequeue-%s-%d' % (self.execution_mode, shard_id)))
+ name='outfeed-dequeue-%s-%d' % (self.execution_mode, shard_id),
+ device_ordinal=shard_id))
return TPUModelOp(
compile_op,
@@ -1126,7 +1130,7 @@ Output shape: %(output_shape)s
'layer': layer,
'input_shape': layer.input_shape,
'output_shape': layer.output_shape
- })
+ })
@experimental
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 7216626a58..92c1eaba71 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -314,7 +314,9 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
# Capture the device function stack at the time of first entry
# since that is the stack that will be used outside_compilation.
graph = ops.get_default_graph()
- self._outer_device_function_stack = list(graph._device_function_stack) # pylint: disable=protected-access
+ # pylint: disable=protected-access
+ self._outer_device_function_stack = graph._device_function_stack.copy()
+ # pylint: enable=protected-access
super(TPUReplicateContext, self).Enter()
def HostComputeCore(self):
@@ -633,23 +635,14 @@ def split_compile_and_replicate(computation,
with tpu_function.tpu_shard_context(
num_replicas), ops.control_dependencies([metadata]):
- # For backward compatibility reasons, we tag replicated inputs with the
- # _tpu_replicated_input attribute. This does nothing and exists only for
- # backward compatibility.
- # TODO(phawkins): delete the attr_scope after 6/28/2018.
- # pylint: disable=protected-access
- with graph._attr_scope({
- "_tpu_replicated_input": attr_value_pb2.AttrValue(b=True)
- }):
- # Add identity ops so even unused inputs are "consumed" by the
- # computation. This is to avoid orphaned TPUReplicatedInput nodes.
- # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
- # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
- computation_inputs = [
- array_ops.identity(x, name="replicated_input_{}".format(i))
- for i, x in enumerate(computation_inputs)
- ]
- # pylint: enable=protected-access
+ # Add identity ops so even unused inputs are "consumed" by the
+ # computation. This is to avoid orphaned TPUReplicatedInput nodes.
+ # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
+ # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
+ computation_inputs = [
+ array_ops.identity(x, name="replicated_input_{}".format(i))
+ for i, x in enumerate(computation_inputs)
+ ]
# If there is an infeed queue, adds the dequeued values to the
# computation's inputs.
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 750e677263..a9cf54f77d 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -146,24 +146,7 @@ class TPUContext(object):
# Note that: For the non-model parallelism, the mapping could be
# a random permutation. The order should not matter in most cases
# as far as model is replicated to all cores in the system.
-
- # If the precise replica_id to device mapping is required, please
- # set the num_cores_per_replica to 1 in TPUConfig to enable the
- # model parallelism.
- if self._internal_ctx.model_parallelism_enabled:
- return RuntimeError(
- 'device_for_replica is not yet implemented for model parallelism. '
- 'b/79689078.')
-
- master = self._internal_ctx.master_job
- job_device = '' if master is None else ('/job:%s' % master)
-
- num_of_replicas_per_host = self._internal_ctx.num_of_replicas_per_host
- host_id = replica_id / num_of_replicas_per_host
- ordinal_id = replica_id % num_of_replicas_per_host
-
- host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
- return (host_device, ordinal_id)
+ return self._internal_ctx.device_for_replica(replica_id)
class _InternalTPUContext(object):
@@ -611,7 +594,7 @@ class _InternalTPUContext(object):
raise ValueError(
'eval batch size {} must be divisible by number of replicas {}'
.format(self._eval_batch_size, num_replicas))
- if num_hosts > 1:
+ if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
raise ValueError(
'TPUEstimator.evaluate should be running on single TPU worker. '
'got {}.'.format(num_hosts))
@@ -626,7 +609,7 @@ class _InternalTPUContext(object):
raise ValueError(
'predict batch size {} must be divisible by number of replicas {}'
.format(self._predict_batch_size, num_replicas))
- if num_hosts > 1:
+ if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
raise ValueError(
'TPUEstimator.predict should be running on single TPU worker. '
'got {}.'.format(num_hosts))
@@ -634,6 +617,33 @@ class _InternalTPUContext(object):
# Record the state "validated" into lazy dictionary.
self._lazy_validation_dict[mode] = True
+ def device_for_replica(self, replica_id):
+ """Returns the tuple of (CPU device and device ordinal) for replica.
+
+ This should be used for full replicate for non-model-parallelism.
+
+ Args:
+ replica_id: Int, the replica index.
+
+ Returns:
+ A tuple of device spec for CPU device and int device ordinal.
+ """
+ master = self.master_job
+
+ if self.model_parallelism_enabled:
+ return (self.device_assignment.host_device(
+ replica=replica_id, job=master),
+ self.device_assignment.tpu_ordinal(replica=replica_id))
+
+ job_device = '' if master is None else ('/job:%s' % master)
+
+ num_of_replicas_per_host = self.num_of_replicas_per_host
+ host_id = replica_id / num_of_replicas_per_host
+ ordinal_id = replica_id % num_of_replicas_per_host
+
+ host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
+ return (host_device, ordinal_id)
+
class _OneCoreTPUContext(_InternalTPUContext):
"""Special _InternalTPUContext for one core usage."""
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 73dfefd19c..ee9ad525ee 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -22,9 +22,9 @@ import collections
import copy
import os
import signal
+import sys
import threading
import time
-import traceback
import numpy as np
import six
@@ -32,6 +32,7 @@ from six.moves import queue as Queue # pylint: disable=redefined-builtin
from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import error_handling
from tensorflow.contrib.tpu.python.tpu import session_support
from tensorflow.contrib.tpu.python.tpu import tpu
from tensorflow.contrib.tpu.python.tpu import tpu_config
@@ -257,7 +258,10 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=prote
eval_metrics=None,
export_outputs=None,
scaffold_fn=None,
- host_call=None):
+ host_call=None,
+ training_hooks=None,
+ evaluation_hooks=None,
+ prediction_hooks=None):
"""Creates a validated `TPUEstimatorSpec` instance."""
host_calls = {}
if eval_metrics is not None:
@@ -265,6 +269,17 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=prote
if host_call is not None:
host_calls['host_call'] = host_call
_OutfeedHostCall.validate(host_calls)
+
+ training_hooks = list(training_hooks or [])
+ evaluation_hooks = list(evaluation_hooks or [])
+ prediction_hooks = list(prediction_hooks or [])
+
+ for hook in training_hooks + evaluation_hooks + prediction_hooks:
+ if not isinstance(hook, session_run_hook.SessionRunHook):
+ raise TypeError(
+ 'All hooks must be SessionRunHook instances, given: {}'.format(
+ hook))
+
return super(TPUEstimatorSpec, cls).__new__(
cls,
mode=mode,
@@ -274,7 +289,10 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=prote
eval_metrics=eval_metrics,
export_outputs=export_outputs,
scaffold_fn=scaffold_fn,
- host_call=host_call)
+ host_call=host_call,
+ training_hooks=training_hooks,
+ evaluation_hooks=evaluation_hooks,
+ prediction_hooks=prediction_hooks)
def as_estimator_spec(self):
"""Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
@@ -290,6 +308,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=prote
hooks = None
if self.host_call is not None:
hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
+ hooks = list(hooks or [])
scaffold = self.scaffold_fn() if self.scaffold_fn else None
return model_fn_lib.EstimatorSpec(
mode=self.mode,
@@ -299,9 +318,9 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec): # pylint: disable=prote
eval_metric_ops=eval_metric_ops,
export_outputs=self.export_outputs,
scaffold=scaffold,
- training_hooks=hooks,
- evaluation_hooks=hooks,
- prediction_hooks=hooks)
+ training_hooks=self.training_hooks + hooks,
+ evaluation_hooks=self.evaluation_hooks + hooks,
+ prediction_hooks=self.prediction_hooks + hooks)
class _OpQueueContext(object):
@@ -365,17 +384,17 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
ctx,
enqueue_ops,
dequeue_ops,
- run_infeed_loop_on_coordinator=True):
+ run_infeed_loop_on_coordinator=True,
+ rendezvous=None):
self._master_job = ctx.master_job
self._enqueue_ops = enqueue_ops
self._dequeue_ops = dequeue_ops
+ self._rendezvous = rendezvous
self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
self._initial_infeed_sleep_secs = (
ctx.config.tpu_config.initial_infeed_sleep_secs)
- self._session_cancel_timer = None
-
self._feed_error = None
self._finished = False
@@ -392,62 +411,6 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
for op in summary_writer_init_ops:
self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
- def _log_error(self, session, error):
- """Log an infeed or outfeed error.
-
- This logs a short error message immediately, and schedules a timer to
- emit the full stack trace and error message after a short period of time.
- If the main session has terminated by the time the timer triggers, we
- assume the real source of the error was from the main session and avoid
- emitting a stack trace for the infeed.
-
- Args:
- session: `tf.Session`, session to be terminated error: exception that
- triggered logging.
- error: the Exception to log.
- """
- logging.warning(
- '\n\n'
- 'Error occurred during infeed/outfeed. This may be due to a compile '
- 'error in the main session. Waiting for a short time for the main '
- 'session to come back.\n\n%s', error)
-
- self._feed_error = traceback.format_exc()
-
- # If we've already encountered a feed error, don't schedule another
- # cancellation op.
- if self._session_cancel_timer:
- return
-
- def _cancel_session():
- """Close the session to avoid the main thread from hanging.
-
- If input pipeline triggers any error, the infeed thread dies but the main
- thread for TPU computation waits for the infeed enqueue forever. Close the
- Session to cancel the main thread Session.run execution.
-
- We sleep for a few seconds before closing to give some time for the TPU
- compilation error, if any, propagating, from TPU to CPU host. Compilation
- errors should be reported by the main thread so that the program can be
- interrupted and users can take action. Due to a race condition, the
- infeed thread might see an error first. Closing the session here
- immediately would result in a session cancellation exception in the main
- thread, instead of the expected compile error. User code that depends on
- having the proper exception type will therefore be confused.
- """
- time.sleep(5)
-
- # If the main session is still running, the infeed/outfeed errors are
- # legitimate, and should be logged.
- if not self._finished and self._feed_error:
- logging.error('Feed error: %s', self._feed_error)
- logging.error('Closing session. A RuntimeError should follow.')
- session.close()
-
- self._session_cancel_timer = threading.Thread(target=_cancel_session)
- self._session_cancel_timer.daemon = True
- self._session_cancel_timer.start()
-
def _run_infeed(self, queue_ctx, session):
logging.info('Starting infeed thread controller.')
if self._initial_infeed_sleep_secs:
@@ -456,7 +419,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
time.sleep(self._initial_infeed_sleep_secs)
logging.info('%s thread starting after sleep', self._name)
- try:
+ with self._rendezvous.catch_errors(source='infeed', session=session):
if self._run_infeed_loop_on_coordinator:
for count, steps in enumerate(queue_ctx.read_iteration_counts()):
for i in xrange(steps):
@@ -466,19 +429,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
for _ in queue_ctx.read_iteration_counts():
session.run(self._enqueue_ops)
logging.info('Infeed thread finished, shutting down.')
- except Exception as e: # pylint: disable=broad-except
- self._log_error(session, e)
def _run_outfeed(self, queue_ctx, session):
logging.info('Starting outfeed thread controller.')
- try:
+ with self._rendezvous.catch_errors(source='outfeed', session=session):
for count, steps in enumerate(queue_ctx.read_iteration_counts()):
for i in xrange(steps):
logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
session.run(self._dequeue_ops)
logging.info('Outfeed thread finished, shutting down.')
- except Exception as e: # pylint: disable=broad-except
- self._log_error(session, e)
def _create_infeed_controller(self, name, target, args):
return _OpQueueContext(name=name, target=target, args=args)
@@ -497,11 +456,6 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
def before_run(self, run_context):
self._feed_error = None
- # Wait for the cancellation timer to complete before continuing.
- if self._session_cancel_timer:
- self._session_cancel_timer.join()
- self._session_cancel_timer = None
-
iterations = run_context.session.run(self._iterations_per_loop_var)
logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
@@ -512,16 +466,14 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
self._outfeed_controller.send_next_batch_signal(iterations)
def end(self, session):
- if self._session_cancel_timer:
- logging.warning('Feed error occurred; waiting for message.')
- self._session_cancel_timer.join()
-
self._finished = True
logging.info('Stop infeed thread controller')
self._infeed_controller.join()
+ self._rendezvous.record_done('infeed')
logging.info('Stop output thread controller')
self._outfeed_controller.join()
+ self._rendezvous.record_done('outfeed')
logging.info('Shutdown TPU system.')
session.run(self._finalize_ops)
@@ -529,9 +481,10 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
- def __init__(self, ctx, enqueue_ops, dequeue_ops):
+ def __init__(self, ctx, enqueue_ops, dequeue_ops, rendezvous=None):
super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
- ctx, enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=False)
+ ctx, enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=False,
+ rendezvous=rendezvous)
def _create_infeed_controller(self, name, target, args):
return _OpSignalOnceQueueContext(name=name, target=target, args=args)
@@ -856,7 +809,15 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
is_dataset = inputs.is_dataset
if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
- raise TypeError('Mode PREDICT not yet supported in BROADCAST mode.')
+ if not is_dataset:
+ raise TypeError(
+ 'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+ '`features` and `labels`.')
+
+ inputs = _InputsWithStoppingSignals(
+ dataset=inputs.dataset,
+ batch_size=ctx.batch_size_for_input_fn,
+ add_padding=True)
if is_dataset:
hooks.append(inputs.dataset_initializer_hook())
@@ -875,6 +836,7 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
"""Generates enqueue ops for all the hosts."""
broadcasted_inputs = []
flattened_inputs = None # Cache result from input_fn.
+ signals = None
for host_id in xrange(num_hosts):
with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
for _ in xrange(ctx.num_of_replicas_per_host):
@@ -884,11 +846,13 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
# hosts).
if flattened_inputs is None:
features, labels = inputs.features_and_labels() # Calls get_next()
+ signals = inputs.signals()
+
inputs_structure_recorder.validate_and_record_structure(
- features, labels)
+ features, labels, signals)
flattened_inputs = (
inputs_structure_recorder.flatten_features_and_labels(
- features, labels))
+ features, labels, signals))
broadcasted_inputs.append(flattened_inputs)
infeed_queue = tpu_feed.InfeedQueue(
@@ -898,7 +862,14 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
broadcasted_inputs,
tpu_ordinal_function=tpu_ordinal_function_impl,
placement_function=device_function_impl)
- return enqueue_ops
+
+ if signals is None:
+ return enqueue_ops
+ else:
+ return {
+ 'ops': enqueue_ops,
+ 'signals': signals,
+ }
return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
@@ -1145,9 +1116,11 @@ class _InputPipeline(object):
all_hooks.extend(hooks)
if is_dataset:
run_infeed_loop_on_coordinator = False
- enqueue_ops.append(
- _wrap_computation_in_while_loop(
- device=host_device, op_fn=enqueue_ops_fn))
+ wrap_fn = (
+ _wrap_computation_in_while_loop
+ if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+ _wrap_computation_in_while_loop_with_stopping_signals)
+ enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
else:
enqueue_ops.append(enqueue_ops_fn())
infeed_queues.append(captured_infeed_queue.get())
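The PREDICT branch above selects a wrapper that threads a stopping signal through the infeed loop. A conceptual sketch of that wrapper, assuming `op_fn` returns the `{'ops': ..., 'signals': ...}` dict produced by the enqueue functions, with the signal simplified to a scalar boolean:

```python
# Conceptual sketch (not the actual helper): run the enqueue computation in
# a tf.while_loop that keeps iterating until the stopping signal flips True.
import tensorflow as tf


def wrap_with_stopping_signals_sketch(op_fn):
  def cond(scalar_stopping_signal):
    return tf.logical_not(scalar_stopping_signal)

  def body(unused_scalar_stopping_signal):
    inputs = op_fn()
    with tf.control_dependencies(inputs['ops']):
      # The real code derives the next signal from the dataset padding logic.
      return tf.identity(inputs['signals'])

  return tf.while_loop(cond, body, [tf.constant(False)])
```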
@@ -1265,6 +1238,7 @@ class _ModelFnWrapper(object):
host_call = _OutfeedHostCall(self._ctx)
captured_scaffold_fn = _CapturedObject()
+ captured_training_hooks = _CapturedObject()
def train_step(loss):
"""Training step function for use inside a while loop."""
@@ -1281,6 +1255,8 @@ class _ModelFnWrapper(object):
else:
captured_scaffold_fn.capture(None)
+ captured_training_hooks.capture(estimator_spec.training_hooks)
+
# We must run train_op to update the variables prior to running the
# outfeed.
with ops.control_dependencies([train_op]):
@@ -1292,7 +1268,8 @@ class _ModelFnWrapper(object):
with ops.control_dependencies(host_call_outfeed_ops):
return array_ops.identity(loss)
- return train_step, host_call, captured_scaffold_fn
+ return (train_step, host_call, captured_scaffold_fn,
+ captured_training_hooks)
def convert_to_single_tpu_eval_step(self, dequeue_fn):
"""Converts user provided model_fn` as a single eval step on TPU.
@@ -1322,6 +1299,7 @@ class _ModelFnWrapper(object):
"""
host_calls = _OutfeedHostCall(self._ctx)
captured_scaffold_fn = _CapturedObject()
+ captured_eval_hooks = _CapturedObject()
def eval_step(total_loss):
"""Evaluation step function for use inside a while loop."""
@@ -1336,6 +1314,8 @@ class _ModelFnWrapper(object):
loss = tpu_estimator_spec.loss
captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+ captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
+
to_record = {}
if tpu_estimator_spec.eval_metrics:
to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
@@ -1348,7 +1328,7 @@ class _ModelFnWrapper(object):
with ops.control_dependencies(host_calls.create_enqueue_op()):
return math_ops.add(total_loss, loss)
- return eval_step, host_calls, captured_scaffold_fn
+ return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
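The hook plumbing above relies on `_CapturedObject` to carry plain Python values (such as hook lists) out of the traced step function. A minimal sketch of such a capture-once container (hypothetical simplification):

```python
# Sketch of a capture-once container in the spirit of _CapturedObject: the
# step function runs while the TPU loop is being traced, so Python-side
# values are stashed during tracing and read back afterwards via get().
class CapturedObjectSketch(object):

  def __init__(self):
    self._captured = False
    self._value = None

  def capture(self, value):
    if self._captured:
      raise RuntimeError('Value has already been captured.')
    self._captured = True
    self._value = value

  def get(self):
    if not self._captured:
      raise RuntimeError('Value has not been captured yet.')
    return self._value
```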
def convert_to_single_tpu_predict_step(self, dequeue_fn):
"""Converts user provided model_fn` as a single predict step on TPU.
@@ -1363,6 +1343,7 @@ class _ModelFnWrapper(object):
"""
host_calls = _OutfeedHostCall(self._ctx)
captured_scaffold_fn = _CapturedObject()
+ captured_predict_hooks = _CapturedObject()
def predict_step(unused_scalar_stopping_signal):
"""Evaluation step function for use inside a while loop."""
@@ -1383,6 +1364,7 @@ class _ModelFnWrapper(object):
self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+ captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
to_record = {}
identity_fn = lambda **kwargs: kwargs
to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
@@ -1394,7 +1376,8 @@ class _ModelFnWrapper(object):
with ops.control_dependencies(host_calls.create_enqueue_op()):
return _StopSignals.as_scalar_stopping_signal(stopping_signals)
- return predict_step, host_calls, captured_scaffold_fn
+ return (predict_step, host_calls, captured_scaffold_fn,
+ captured_predict_hooks)
def _verify_tpu_spec_predictions(self, predictions):
"""Validates TPUEstimatorSpec.predictions dict."""
@@ -1516,11 +1499,9 @@ class _ModelFnWrapper(object):
err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
if estimator_spec.training_chief_hooks:
- raise ValueError(err_msg.format('training_chief_hooks'))
- if estimator_spec.training_hooks:
- raise ValueError(err_msg.format('training_hooks'))
- if estimator_spec.evaluation_hooks:
- raise ValueError(err_msg.format('evaluation_hooks'))
+ raise ValueError(
+          err_msg.format('training_chief_hooks') + ' If you want to pass'
+          ' training hooks, please pass them via training_hooks.')
if estimator_spec.scaffold:
logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
@@ -1660,11 +1641,13 @@ class _OutfeedHostCall(object):
# Outfeed ops execute on each replica's first logical core. Note: we must
# constrain it such that we have at most one outfeed dequeue and enqueue
# per replica.
- tpu_device_placement_fn = self._ctx.tpu_device_placement_function
for i in xrange(self._ctx.num_replicas):
- with ops.device(tpu_device_placement_fn(i)):
+ host_device, ordinal_id = self._ctx.device_for_replica(i)
+ with ops.device(host_device):
outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
- dtypes=tensor_dtypes, shapes=tensor_shapes)
+ dtypes=tensor_dtypes,
+ shapes=tensor_shapes,
+ device_ordinal=ordinal_id)
for j, item in enumerate(outfeed_tensors):
dequeue_ops[j].append(item)
@@ -1788,6 +1771,9 @@ class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
class TPUEstimator(estimator_lib.Estimator):
"""Estimator with TPU support.
+ TPUEstimator also supports training on CPU and GPU. You don't need to define
+ a separate `tf.estimator.Estimator`.
+
TPUEstimator handles many of the details of running on TPU devices, such as
replicating inputs and models for each core, and returning to host
periodically to run hooks.
@@ -1825,7 +1811,8 @@ class TPUEstimator(estimator_lib.Estimator):
Current limitations:
--------------------
- 1. TPU evaluation only works on a single host (one TPU worker).
+ 1. TPU evaluation only works on a single host (one TPU worker) except
+     in BROADCAST mode.
2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
(`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
@@ -1996,10 +1983,9 @@ class TPUEstimator(estimator_lib.Estimator):
"""Constructs an `TPUEstimator` instance.
Args:
- model_fn: Model function as required by `Estimator`. For training, the
- returned `EstimatorSpec` cannot have hooks as it is not supported in
- `TPUEstimator`. Instead, the user can pass the training hooks as
- an argument to `TPUEstimator.train()`.
+      model_fn: Model function as required by `Estimator` which returns
+        EstimatorSpec or TPUEstimatorSpec. `training_hooks`,
+        `evaluation_hooks`, and `prediction_hooks` must not capture any TPU
+        Tensor inside the model_fn.
model_dir: Directory to save model parameters, graph, etc. This can
also be used to load checkpoints from the directory into an estimator to
continue training a previously saved model. If `None`, the model_dir in
@@ -2111,6 +2097,7 @@ class TPUEstimator(estimator_lib.Estimator):
self._export_to_tpu = export_to_tpu
self._is_input_fn_invoked = None
+ self._rendezvous = {}
def _add_meta_graph_for_mode(self,
builder,
@@ -2354,6 +2341,65 @@ class TPUEstimator(estimator_lib.Estimator):
"""
pass
+ def train(self,
+ input_fn,
+ hooks=None,
+ steps=None,
+ max_steps=None,
+ saving_listeners=None):
+ rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+ self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
+ try:
+ return super(TPUEstimator, self).train(
+ input_fn=input_fn, hooks=hooks, steps=steps, max_steps=max_steps,
+ saving_listeners=saving_listeners
+ )
+ except Exception: # pylint: disable=broad-except
+ rendezvous.record_error('training_loop', sys.exc_info())
+ finally:
+ rendezvous.record_done('training_loop')
+ rendezvous.raise_errors()
+
+ def evaluate(self, input_fn, steps=None, hooks=None, checkpoint_path=None,
+ name=None):
+ rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+ self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
+ try:
+ return super(TPUEstimator, self).evaluate(
+ input_fn, steps=steps, hooks=hooks, checkpoint_path=checkpoint_path,
+ name=name
+ )
+ except Exception: # pylint: disable=broad-except
+ rendezvous.record_error('evaluation_loop', sys.exc_info())
+ finally:
+ rendezvous.record_done('evaluation_loop')
+ rendezvous.raise_errors()
+
+ def predict(self,
+ input_fn,
+ predict_keys=None,
+ hooks=None,
+ checkpoint_path=None,
+ yield_single_examples=True):
+ rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+ self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
+ try:
+ for result in super(TPUEstimator, self).predict(
+ input_fn=input_fn,
+ predict_keys=predict_keys,
+ hooks=hooks,
+ checkpoint_path=checkpoint_path,
+ yield_single_examples=yield_single_examples):
+ yield result
+ except Exception: # pylint: disable=broad-except
+ rendezvous.record_error('prediction_loop', sys.exc_info())
+ finally:
+ rendezvous.record_done('prediction_loop')
+ rendezvous.raise_errors()
+
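All three entry points above follow the same shape: the public API's own failure is funneled into the rendezvous alongside the infeed and outfeed threads before `raise_errors()` re-raises the most relevant one. A compact sketch of the pattern (names hypothetical):

```python
# Sketch of the wrapping pattern used by train/evaluate/predict above.
import sys


def run_with_rendezvous_sketch(rendezvous, loop_fn, source):
  try:
    return loop_fn()
  except Exception:  # pylint: disable=broad-except
    rendezvous.record_error(source, sys.exc_info())
  finally:
    rendezvous.record_done(source)
    rendezvous.raise_errors()
```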
def _augment_model_fn(self, model_fn, batch_axis):
"""Returns a new model_fn, which wraps the TPU support."""
@@ -2408,7 +2454,7 @@ class TPUEstimator(estimator_lib.Estimator):
graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
if mode == model_fn_lib.ModeKeys.TRAIN:
- loss, host_call, scaffold = (
+ loss, host_call, scaffold, training_hooks = (
_train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
host_ops = host_call.create_tpu_hostcall()
if host_ops is None:
@@ -2448,7 +2494,9 @@ class TPUEstimator(estimator_lib.Estimator):
enqueue_ops,
host_ops,
run_infeed_loop_on_coordinator=(
- run_infeed_loop_on_coordinator)),
+ run_infeed_loop_on_coordinator),
+ rendezvous=self._rendezvous[mode],
+ ),
InstallSignalHandlerHook(),
training.LoggingTensorHook(
{
@@ -2461,6 +2509,9 @@ class TPUEstimator(estimator_lib.Estimator):
self._config.tpu_config.iterations_per_loop)
hooks.append(examples_hook)
+ if training_hooks:
+ hooks.extend(training_hooks)
+
chief_hooks = []
if (self._config.save_checkpoints_secs or
self._config.save_checkpoints_steps):
@@ -2472,6 +2523,7 @@ class TPUEstimator(estimator_lib.Estimator):
checkpoint_hook._set_steps_per_run( # pylint: disable=protected-access
self._config.tpu_config.iterations_per_loop)
chief_hooks.append(checkpoint_hook)
+
summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
with ops.control_dependencies([loss]):
update_ops = _sync_variables_ops()
@@ -2491,7 +2543,7 @@ class TPUEstimator(estimator_lib.Estimator):
scaffold=scaffold)
if mode == model_fn_lib.ModeKeys.EVAL:
- total_loss, host_calls, scaffold = _eval_on_tpu_system(
+ total_loss, host_calls, scaffold, eval_hooks = _eval_on_tpu_system(
ctx, model_fn_wrapper, dequeue_fn)
iterations_per_loop_var = _create_or_get_iterations_per_loop()
mean_loss = math_ops.div(total_loss,
@@ -2531,9 +2583,13 @@ class TPUEstimator(estimator_lib.Estimator):
enqueue_ops,
eval_update_ops + host_ops,
run_infeed_loop_on_coordinator=(
- run_infeed_loop_on_coordinator)),
+ run_infeed_loop_on_coordinator),
+ rendezvous=self._rendezvous[mode]),
] + input_hooks
+ if eval_hooks:
+ hooks.extend(eval_hooks)
+
return model_fn_lib.EstimatorSpec(
mode,
loss=mean_loss,
@@ -2544,8 +2600,9 @@ class TPUEstimator(estimator_lib.Estimator):
# Predict
assert mode == model_fn_lib.ModeKeys.PREDICT
- dummy_predict_op, host_calls, scaffold = _predict_on_tpu_system(
- ctx, model_fn_wrapper, dequeue_fn)
+ (dummy_predict_op, host_calls,
+ scaffold, prediction_hooks) = _predict_on_tpu_system(
+ ctx, model_fn_wrapper, dequeue_fn)
with ops.control_dependencies([dummy_predict_op]):
internal_ops_to_run = _sync_variables_ops()
with ops.control_dependencies(internal_ops_to_run):
@@ -2597,10 +2654,13 @@ class TPUEstimator(estimator_lib.Estimator):
hooks = [
_StoppingPredictHook(scalar_stopping_signal),
- TPUInfeedOutfeedSessionHookForPrediction(ctx, enqueue_ops,
- host_ops),
+ TPUInfeedOutfeedSessionHookForPrediction(
+ ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode]),
] + input_hooks
+ if prediction_hooks:
+ hooks.extend(prediction_hooks)
+
return model_fn_lib.EstimatorSpec(
mode,
prediction_hooks=hooks,
@@ -2684,8 +2744,8 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
"""Executes `model_fn_wrapper` multiple times on all TPU shards."""
iterations_per_loop_var = _create_or_get_iterations_per_loop()
- single_tpu_eval_step, host_calls, captured_scaffold_fn = (
- model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn))
+ (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
+ ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
def multi_tpu_eval_steps_on_single_shard():
return training_loop.repeat(
@@ -2700,15 +2760,16 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
device_assignment=ctx.device_assignment)
scaffold = _get_scaffold(captured_scaffold_fn)
- return loss, host_calls, scaffold
+ return loss, host_calls, scaffold, captured_eval_hooks.get()
def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
"""Executes `model_fn_wrapper` multiple times on all TPU shards."""
iterations_per_loop_var = _create_or_get_iterations_per_loop()
- single_tpu_train_step, host_call, captured_scaffold_fn = (
- model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
+ (single_tpu_train_step, host_call, captured_scaffold_fn,
+ captured_training_hooks) = (
+ model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
def multi_tpu_train_steps_on_single_shard():
return training_loop.repeat(
@@ -2723,15 +2784,16 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
device_assignment=ctx.device_assignment)
scaffold = _get_scaffold(captured_scaffold_fn)
- return loss, host_call, scaffold
+ return loss, host_call, scaffold, captured_training_hooks.get()
def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
"""Executes `model_fn_wrapper` multiple times on all TPU shards."""
num_cores = ctx.num_cores
- single_tpu_predict_step, host_calls, captured_scaffold_fn = (
- model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn))
+ (single_tpu_predict_step, host_calls, captured_scaffold_fn,
+ captured_predict_hooks
+ ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
def multi_tpu_predict_steps_on_single_shard():
@@ -2751,7 +2813,7 @@ def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
outputs_from_all_shards=False)
scaffold = _get_scaffold(captured_scaffold_fn)
- return dummy_predict_op, host_calls, scaffold
+ return dummy_predict_op, host_calls, scaffold, captured_predict_hooks.get()
def _wrap_computation_in_while_loop(device, op_fn):
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 9cb3d1fbbf..3cb5e61fac 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/common_runtime/process_state.h"
#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
#include "tensorflow/core/distributed_runtime/session_mgr.h"
@@ -255,28 +256,25 @@ void MRDeleter(ibv_mr* mr) {
}
}
-// TODO(byronyi): remove this class duplicated from the one in
-// common/runtime/gpu/pool_allocator.h when it is available in common_runtime
-class BasicCPUAllocator : public SubAllocator {
- public:
- ~BasicCPUAllocator() override {}
-
- void* Alloc(size_t alignment, size_t num_bytes) override {
- return port::AlignedMalloc(num_bytes, alignment);
- }
- void Free(void* ptr, size_t) override { port::AlignedFree(ptr); }
-};
-
// TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator
+// cpu_allocator() returns a visitable allocator, or cpu_allocator() is no
+// longer in use.
class BFCRdmaAllocator : public BFCAllocator {
public:
BFCRdmaAllocator()
- : BFCAllocator(new BasicCPUAllocator(), 1LL << 36, true, "cpu_rdma_bfc") {
+ : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
+ true, "cpu_rdma_bfc") {}
+};
+class BFCRdmaAllocatorFactory : public AllocatorFactory {
+ public:
+ Allocator* CreateAllocator() { return new BFCRdmaAllocator; }
+
+ SubAllocator* CreateSubAllocator(int numa_node) {
+ return new BasicCPUAllocator(numa_node);
}
};
-REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocator);
+REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory);
void RdmaMgr::InitAllocators() {
RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_;
@@ -284,8 +282,8 @@ void RdmaMgr::InitAllocators() {
Allocator* allocators[] = {
#if GOOGLE_CUDA
GPUProcessState::singleton()->GetCUDAHostAllocator(0),
- ProcessState::singleton()->GetCPUAllocator(0),
#endif // GOOGLE_CUDA
+ ProcessState::singleton()->GetCPUAllocator(0),
cpu_allocator(),
};
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index fc12027291..35a112e834 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -662,6 +662,7 @@ cc_library(
"lib/random/random_distributions.h",
"lib/random/simple_philox.h",
"lib/strings/numbers.h",
+ "lib/strings/proto_serialization.h",
"lib/strings/str_util.h",
"lib/strings/strcat.h",
"lib/strings/stringprintf.h",
@@ -883,6 +884,16 @@ cc_library(
copts = tf_copts(),
)
+tf_cc_test(
+ name = "stats_calculator_test",
+ srcs = ["util/stats_calculator_test.cc"],
+ deps = [
+ ":stats_calculator_portable",
+ ":test",
+ ":test_main",
+ ],
+)
+
cc_library(
name = "overflow",
hdrs = ["util/overflow.h"],
@@ -1645,6 +1656,7 @@ cc_library(
copts = tf_copts(android_optimization_level_override = None) + [
"-DSUPPORT_SELECTIVE_REGISTRATION",
],
+ linkopts = if_android(["-lz"]),
tags = [
"manual",
"notap",
@@ -1668,6 +1680,7 @@ cc_library(
copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [
"-DSUPPORT_SELECTIVE_REGISTRATION",
],
+ linkopts = if_android(["-lz"]),
tags = [
"manual",
"notap",
@@ -2912,6 +2925,14 @@ tf_cuda_library(
)
cc_library(
+ name = "session_ref",
+ srcs = ["common_runtime/session_ref.cc"],
+ hdrs = ["common_runtime/session_ref.h"],
+ copts = tf_copts(),
+ deps = [":core_cpu_base"],
+)
+
+cc_library(
name = "gpu_id",
hdrs = [
"common_runtime/gpu/gpu_id.h",
@@ -3736,7 +3757,6 @@ tf_cc_tests_gpu(
"common_runtime/gpu/gpu_bfc_allocator_test.cc",
"common_runtime/gpu/gpu_device_test.cc",
"common_runtime/gpu/gpu_id_manager_test.cc",
- "common_runtime/gpu/gpu_event_mgr_test.cc",
"common_runtime/gpu/pool_allocator_test.cc",
],
linkstatic = tf_kernel_tests_linkstatic(),
@@ -3760,6 +3780,23 @@ tf_cc_tests_gpu(
],
)
+tf_cc_test_gpu(
+ name = "gpu_event_mgr_test",
+ srcs = ["common_runtime/gpu/gpu_event_mgr_test.cc"],
+ linkstatic = tf_kernel_tests_linkstatic(),
+ tags = tf_cuda_tests_tags(),
+ deps = [
+ ":framework",
+ ":framework_internal",
+ ":lib",
+ ":lib_internal",
+ ":protos_all_cc",
+ ":test",
+ ":test_main",
+ ":testlib",
+ ],
+)
+
tf_cuda_cc_test(
name = "gpu_device_unified_memory_test",
size = "small",
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
index 3b3a274df5..2b58969da2 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -51,7 +51,7 @@ For example, say we want to update 4 scattered elements to a rank-1 tensor to
8 elements. In Python, that update would look like this:
```python
- ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+ ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
indices = tf.constant([[4], [3], [1] ,[7]])
updates = tf.constant([9, 10, 11, 12])
update = tf.scatter_nd_add(ref, indices, updates)
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 58753a651a..ad1c527b01 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -32,8 +32,12 @@ slices within a tensor (initially zero for numeric, empty for string) of
the given `shape` according to indices. This operator is the inverse of the
@{tf.gather_nd} operator which extracts values or slices from a given tensor.
+If `indices` contains duplicates, then their updates are accumulated (summed).
+
**WARNING**: The order in which updates are applied is nondeterministic, so the
-output will be nondeterministic if `indices` contains duplicates.
+output will be nondeterministic if `indices` contains duplicates: because
+floating-point addition is not associative, numbers summed in a different
+order may yield slightly different results.
`indices` is an integer tensor containing indices into a new tensor of shape
`shape`. The last dimension of `indices` can be at most the rank of `shape`:
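A small example of the duplicate-index accumulation described above; only the low-order floating-point bits can differ between runs when duplicates are summed in varying order:

```python
import tensorflow as tf

indices = tf.constant([[1], [1]])   # index 1 appears twice
updates = tf.constant([0.1, 0.2])
out = tf.scatter_nd(indices, updates, shape=[4])
# out == [0.0, 0.3, 0.0, 0.0] up to floating-point rounding; the summation
# order of the two updates is not guaranteed.
```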
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 4c670820be..d1fd930d25 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -235,7 +235,11 @@ void DirectSession::SchedClosure(thread::ThreadPool* pool,
// safe given the reasoning above.
c();
#else
- pool->Schedule(std::move(c));
+ if (pool != nullptr) {
+ pool->Schedule(std::move(c));
+ } else {
+ c();
+ }
#endif // __ANDROID__
}
@@ -522,8 +526,9 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
}
}
- if (run_options.inter_op_thread_pool() < 0 ||
- run_options.inter_op_thread_pool() >= thread_pools_.size()) {
+ if (run_options.inter_op_thread_pool() < -1 ||
+ run_options.inter_op_thread_pool() >=
+ static_cast<int32>(thread_pools_.size())) {
run_state.executors_done.Notify();
delete barrier;
return errors::InvalidArgument("Invalid inter_op_thread_pool: ",
@@ -548,7 +553,19 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
}
thread::ThreadPool* pool =
- thread_pools_[run_options.inter_op_thread_pool()].first;
+ run_options.inter_op_thread_pool() >= 0
+ ? thread_pools_[run_options.inter_op_thread_pool()].first
+ : nullptr;
+
+ if (pool == nullptr) {
+    // We allow running on the caller thread only when a single executor is
+    // specified.
+ if (executors_and_keys->items.size() > 1) {
+ pool = thread_pools_[0].first;
+ } else {
+ VLOG(1) << "Executing Session::Run() synchronously!";
+ }
+ }
Executor::Args::Runner default_runner = [this,
pool](Executor::Args::Closure c) {
@@ -700,7 +717,8 @@ Status DirectSession::Run(const RunOptions& run_options,
// Receive outputs.
if (outputs) {
std::vector<Tensor> sorted_outputs;
- const Status s = call_frame.ConsumeRetvals(&sorted_outputs);
+ const Status s = call_frame.ConsumeRetvals(
+ &sorted_outputs, /* allow_dead_tensors = */ false);
if (errors::IsInternal(s)) {
return errors::InvalidArgument(s.error_message());
} else if (!s.ok()) {
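The `DirectSession` change above makes `inter_op_thread_pool = -1` mean "execute the step synchronously on the calling thread" (falling back to pool 0 when a run involves several executors). From Python this is reachable through the `RunOptions` proto; a minimal usage sketch:

```python
import tensorflow as tf

run_options = tf.RunOptions(inter_op_thread_pool=-1)
with tf.Session() as sess:
  x = tf.constant(17)
  # With inter_op_thread_pool = -1 the op runs on the caller thread instead
  # of being scheduled onto an inter-op thread pool.
  print(sess.run(x, options=run_options))
```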
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 142d613129..4b51b20bb1 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include <map>
#include <memory>
#include <string>
+#include <thread>
#include <unordered_map>
#include <vector>
@@ -896,6 +897,125 @@ TEST(DirectSessionTest, FetchMultipleTimes) {
}
}
+TEST(DirectSessionTest, MultipleFeedTestSomeSyncRun) {
+ GraphDef def;
+ Graph g(OpRegistry::Global());
+ RunOptions run_options;
+ run_options.set_inter_op_thread_pool(-1);
+
+ Tensor first_value(DT_FLOAT, TensorShape({}));
+ first_value.scalar<float>()() = 1.0;
+ Node* first_const = test::graph::Constant(&g, first_value);
+ Node* first_identity = test::graph::Identity(&g, first_const);
+
+ Tensor second_value(DT_FLOAT, TensorShape({}));
+ second_value.scalar<float>()() = 2.0;
+ Node* second_const = test::graph::Constant(&g, second_value);
+ Node* second_identity = test::graph::Identity(&g, second_const);
+
+ test::graph::ToGraphDef(&g, &def);
+
+ auto session = CreateSession();
+ ASSERT_TRUE(session != nullptr);
+ TF_ASSERT_OK(session->Create(def));
+
+ std::vector<Tensor> outputs;
+
+ // Fetch without feeding.
+ Status s = session->Run(
+ run_options, {},
+ {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
+ &outputs, nullptr);
+ TF_ASSERT_OK(s);
+ ASSERT_EQ(2, outputs.size());
+ ASSERT_EQ(1.0, outputs[0].flat<float>()(0));
+ ASSERT_EQ(2.0, outputs[1].flat<float>()(0));
+
+ s = session->Run(
+ {}, {second_identity->name() + ":0", first_identity->name() + ":0"}, {},
+ &outputs);
+ TF_ASSERT_OK(s);
+ ASSERT_EQ(2, outputs.size());
+ ASSERT_EQ(2.0, outputs[0].flat<float>()(0));
+ ASSERT_EQ(1.0, outputs[1].flat<float>()(0));
+
+ Tensor value_11(DT_FLOAT, TensorShape({}));
+ value_11.scalar<float>()() = 11.0;
+ Tensor value_22(DT_FLOAT, TensorShape({}));
+ value_22.scalar<float>()() = 22.0;
+
+ // Feed [first_const, second_const]
+ s = session->Run(
+ {{first_const->name(), value_11}, {second_const->name(), value_22}},
+ {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
+ &outputs);
+ TF_ASSERT_OK(s);
+ ASSERT_EQ(2, outputs.size());
+ ASSERT_EQ(11.0, outputs[0].flat<float>()(0));
+ ASSERT_EQ(22.0, outputs[1].flat<float>()(0));
+
+ // Feed [second_const, first_const]
+ s = session->Run(
+ {{second_const->name(), value_22}, {first_const->name(), value_11}},
+ {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
+ &outputs);
+ TF_ASSERT_OK(s);
+ ASSERT_EQ(2, outputs.size());
+ ASSERT_EQ(11.0, outputs[0].flat<float>()(0));
+ ASSERT_EQ(22.0, outputs[1].flat<float>()(0));
+
+ // Feed [first_const, first_const]
+ s = session->Run(
+ run_options,
+ {{first_const->name(), value_11}, {first_const->name(), value_22}},
+ {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
+ &outputs, nullptr);
+ EXPECT_TRUE(errors::IsInvalidArgument(s));
+ EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+}
+
+REGISTER_OP("ThreadID").Input("x: int64").Output("y: int64").Doc(R"doc(
+ThreadID returns the thread ID that called compute.
+
+x: int64
+y: int64
+)doc");
+
+// The ThreadID kernel returns the thread ID that executed Compute.
+class ThreadIDOp : public OpKernel {
+ public:
+ explicit ThreadIDOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+ void Compute(OpKernelContext* ctx) override {
+ Tensor* out_tensor = nullptr;
+ OP_REQUIRES_OK(ctx,
+ ctx->allocate_output("y", TensorShape({}), &out_tensor));
+ std::hash<std::thread::id> hasher;
+ out_tensor->scalar<int64>()() =
+ static_cast<int64>(hasher(std::this_thread::get_id()));
+ }
+};
+REGISTER_KERNEL_BUILDER(Name("ThreadID").Device(DEVICE_CPU), ThreadIDOp);
+
+TEST(DirectSessionTest, SessionSyncRun) {
+ Graph g(OpRegistry::Global());
+ Tensor vx(DT_INT64, TensorShape({}));
+ vx.scalar<int64>()() = 17;
+ Node* x = test::graph::Constant(&g, vx);
+ Node* y = test::graph::Unary(&g, "ThreadID", x);
+ GraphDef def;
+ test::graph::ToGraphDef(&g, &def);
+ auto sess = CreateSession();
+ TF_ASSERT_OK(sess->Create(def));
+ std::vector<Tensor> outputs;
+ RunOptions run_opts;
+ run_opts.set_inter_op_thread_pool(-1);
+ auto s = sess->Run(run_opts, {}, {y->name() + ":0"}, {}, &outputs, nullptr);
+
+ std::hash<std::thread::id> hasher;
+ EXPECT_EQ(static_cast<int64>(hasher(std::this_thread::get_id())),
+ static_cast<int64>(outputs[0].scalar<int64>()()));
+}
+
REGISTER_OP("Darth").Input("x: float").Output("y: float").Doc(R"doc(
Darth promises one return value.
@@ -1400,6 +1520,7 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
p = options.config.add_session_inter_op_thread_pool();
if (use_global_pools) p->set_global_name("small pool");
p->set_num_threads(1);
+ const int kSyncPool = -1;
const int kLargePool = 0;
const int kSmallPool = 1;
@@ -1442,7 +1563,11 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
EXPECT_FLOAT_EQ(1.2, flat(0));
num_done.fetch_add(1);
};
- tp->Schedule(fn);
+ if (tp != nullptr) {
+ tp->Schedule(fn);
+ } else {
+ fn();
+ }
};
// For blocking states:
@@ -1463,9 +1588,10 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
tp1 = new thread::ThreadPool(Env::Default(), "tp1", 5);
- // Launch 2 session run calls. Neither will finish until the blocking op is
+ // Launch a session run call. It will not finish until the blocking op is
// unblocked, because it is using all threads in the small pool.
add_session_run_call(tp1, y, kSmallPool);
+
blocking_op_state->AwaitState(1); // Wait for the blocking op to Compute.
// These will block on <BlockingOpState>.
@@ -1484,10 +1610,15 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
delete tp2;
EXPECT_EQ(kUnblockedThreads, num_done.load());
+  // Launch a session run call on this thread. It finishes immediately
+  // because it runs synchronously on this thread.
+ add_session_run_call(nullptr, x, kSyncPool);
+
// Unblock the blocked op and wait for the blocked functions to finish.
blocking_op_state->MoveToState(1, 2);
delete tp1;
- EXPECT_EQ(kUnblockedThreads + kBlockedThreads + 1, num_done.load());
+
+ EXPECT_EQ(kUnblockedThreads + kBlockedThreads + 1 + 1, num_done.load());
delete blocking_op_state;
blocking_op_state = nullptr;
}
@@ -1532,7 +1663,7 @@ TEST(DirectSessionTest, TestSessionInterOpThreadsInvalidOptions) {
{
std::unique_ptr<Session> session(NewSession(options));
TF_ASSERT_OK(session->Create(def));
- for (int pool_num = -1; pool_num <= 1; pool_num += 2) {
+ for (int pool_num = -2; pool_num <= 1; pool_num += 3) {
RunOptions run_options;
run_options.set_inter_op_thread_pool(pool_num);
std::vector<Tensor> outputs;
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 1c5e9a2a31..5e0f0a45f8 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -17,8 +17,20 @@ limitations under the License.
#include "tensorflow/core/common_runtime/process_util.h"
#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/util/env_var.h"
namespace tensorflow {
+namespace {
+
+bool ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val) {
+ bool val;
+ if (ReadBoolFromEnvVar(env_var_name, default_val, &val).ok()) {
+ return val;
+ }
+ return default_val;
+}
+
+} // namespace
EagerContext::EagerContext(const SessionOptions& opts,
ContextDevicePlacementPolicy default_policy,
@@ -34,7 +46,8 @@ EagerContext::EagerContext(const SessionOptions& opts,
local_device_manager_.get(), opts.env, TF_GRAPH_DEF_VERSION,
&func_lib_def_, {}, thread_pool_.get())),
log_device_placement_(opts.config.log_device_placement()),
- async_default_(async) {
+ async_default_(async),
+ use_send_tensor_rpc_(false) {
InitDeviceMapAndAsync();
if (opts.config.inter_op_parallelism_threads() > 0) {
runner_ = [this](std::function<void()> closure) {
@@ -66,7 +79,9 @@ EagerContext::EagerContext(
remote_device_manager_(std::move(remote_device_manager)),
server_(std::move(server)),
remote_eager_workers_(std::move(remote_eager_workers)),
- remote_contexts_(remote_contexts) {
+ remote_contexts_(remote_contexts),
+ use_send_tensor_rpc_(
+ ReadBoolFromEnvVar("TF_EAGER_REMOTE_USE_SEND_TENSOR_RPC", false)) {
InitDeviceMapAndAsync();
}
#endif
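The constructor above gates the new RPC path on the `TF_EAGER_REMOTE_USE_SEND_TENSOR_RPC` environment variable. A Python analogue of the boolean env-var helper (a sketch, not the C++ `ReadBoolFromEnvVar` from `env_var.h`):

```python
import os


def read_bool_from_env_var(name, default):
  # Falls back to the default when the variable is unset or unparseable,
  # mirroring the "return default_val on error" behavior above.
  raw = os.environ.get(name)
  if raw is None:
    return default
  return raw.strip().lower() in ('1', 'true', 't', 'y', 'yes')


use_send_tensor_rpc = read_bool_from_env_var(
    'TF_EAGER_REMOTE_USE_SEND_TENSOR_RPC', False)
```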
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index d0563280bf..4a180e074d 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -182,6 +182,11 @@ class EagerContext {
#ifndef __ANDROID__
Status GetClientAndContextID(Device* device, eager::EagerClient** client,
uint64* context_id);
+
+ // If true, then tensors should be shipped across processes via the
+ // EagerService.SendTensor RPC. If false, _Send/_Recv ops should be used
+  // instead (which in turn use WorkerService.RecvTensor RPCs).
+ bool UseSendTensorRPC() { return use_send_tensor_rpc_; }
#endif
private:
void InitDeviceMapAndAsync();
@@ -239,16 +244,18 @@ class EagerContext {
const std::unique_ptr<DeviceMgr> remote_device_manager_;
+#ifndef __ANDROID__
// The server_ is not const since we release it when the context is destroyed.
// Therefore the server_ object is not marked as const (even though it should
// be).
-#ifndef __ANDROID__
std::unique_ptr<ServerInterface> server_;
const std::unique_ptr<eager::EagerClientCache> remote_eager_workers_;
const gtl::FlatMap<string, uint64> remote_contexts_;
gtl::FlatMap<Device*, std::pair<eager::EagerClient*, uint64>>
device_to_client_cache_;
+
+ const bool use_send_tensor_rpc_;
#endif
};
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 5ea814ed4e..f97fa4fadc 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -88,6 +88,8 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
TF_RETURN_IF_ERROR((*handle)->Device(&handle_device));
const Device* actual_device =
handle_device == nullptr ? ctx->HostCPU() : handle_device;
+ const Device* op_device =
+ op->Device() == nullptr ? ctx->HostCPU() : op->Device();
if (expected_device != actual_device) {
switch (ctx->GetDevicePlacementPolicy()) {
@@ -106,8 +108,8 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
" cannot compute ",
op->Name(), " as input #", i, " was expected to be on ",
expected_device->name(), " but is actually on ",
- actual_device->name(), " (operation running on ",
- op->Device()->name(), ")",
+ actual_device->name(), " (operation running on ", op_device->name(),
+ ")",
" Tensors can be copied explicitly using .gpu() or .cpu() "
"methods,"
" or transparently copied by using tf.enable_eager_execution("
@@ -118,7 +120,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
LOG(WARNING) << "before computing " << op->Name() << " input #" << i
<< " was expected to be on " << expected_device->name()
<< " but is actually on " << actual_device->name()
- << " (operation running on " << op->Device()->name()
+ << " (operation running on " << op_device->name()
<< "). This triggers a copy which can be a performance "
"bottleneck.";
break;
@@ -173,7 +175,7 @@ Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
tensorflow::TensorHandle* handle = op->Inputs()[i];
if (handle->dtype != kernel->input_type(i)) {
return errors::InvalidArgument(
- "cannot compute ", op->Name(), " as input #", i,
+ "cannot compute ", op->Name(), " as input #", i, "(zero-based)",
" was expected to be a ", DataTypeString(kernel->input_type(i)),
" tensor but is a ", DataTypeString(handle->dtype), " tensor");
}
@@ -446,6 +448,14 @@ bool IsLocal(EagerContext* ctx, tensorflow::Device* d) {
return ctx->local_device_mgr()->LookupDevice(d->name(), &tmp).ok();
}
+bool OnSameTask(EagerContext* ctx, Device* first, Device* second) {
+ if (first == nullptr) first = ctx->HostCPU();
+ if (second == nullptr) second = ctx->HostCPU();
+ return first->parsed_name().job == second->parsed_name().job &&
+ first->parsed_name().replica == second->parsed_name().replica &&
+ first->parsed_name().task == second->parsed_name().task;
+}
+
Status EagerLocalExecute(EagerOperation* op,
gtl::InlinedVector<TensorHandle*, 2>* retvals,
int* num_retvals) {
@@ -583,6 +593,87 @@ Status EagerLocalExecute(EagerOperation* op,
return status;
}
+std::function<void()> GetRemoteTensorDestructor(
+ EagerContext* ctx, eager::EagerClient* eager_client, uint64 context_id,
+ uint64 op_id, int output_num) {
+ return [ctx, eager_client, context_id, op_id, output_num]() {
+ std::unique_ptr<eager::EnqueueRequest> request(new eager::EnqueueRequest);
+ request->set_context_id(context_id);
+
+ auto* handle_to_decref = request->add_queue()->mutable_handle_to_decref();
+ handle_to_decref->set_op_id(op_id);
+ handle_to_decref->set_output_num(output_num);
+
+ if (ctx->Async()) {
+ tensorflow::uint64 id = ctx->NextId();
+ auto* node =
+ new eager::RemoteExecuteNode(id, std::move(request), eager_client);
+ ctx->ExecutorAdd(node);
+ } else {
+ eager::EnqueueRequest* actual_request = request.release();
+ eager::EnqueueResponse* response = new eager::EnqueueResponse;
+ eager_client->EnqueueAsync(
+ actual_request, response,
+ [actual_request, response](const tensorflow::Status& s) {
+ delete actual_request;
+ delete response;
+ });
+ }
+
+ return tensorflow::Status::OK();
+ };
+}
+
+// When !ctx->UseSendTensorRPC(), then tensors are shipped between remote
+// devices by the receiver invoking the WorkerService.RecvTensor RPC *on the
+// sender* (Rendezvous::RecvAsync() invoked by the _Recv kernel).
+//
+// However, in some configurations the node that has the tensor to be copied
+// isn't running a server (WorkerService RPC interface). For such cases,
+// this function enables sending tensors using the EagerService.SendTensor RPC
+// *on the receiver*.
+Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
+ Device* recv_device, TensorHandle** result) {
+ eager::EagerClient* eager_client;
+ uint64 context_id;
+ TF_RETURN_IF_ERROR(
+ ctx->GetClientAndContextID(recv_device, &eager_client, &context_id));
+
+ eager::SendTensorRequest request;
+ eager::SendTensorResponse response;
+
+ request.set_context_id(context_id);
+ request.set_op_id(ctx->NextId());
+ request.set_device_name(recv_device->name());
+
+ const Tensor* tensor;
+ TF_RETURN_IF_ERROR(h->Tensor(&tensor));
+ tensor->AsProtoTensorContent(request.add_tensors());
+
+ const tensorflow::uint64 id = request.op_id();
+
+ // TODO(nareshmodi): support making this call async.
+ Notification n;
+ Status status;
+ eager_client->SendTensorAsync(&request, &response,
+ [&n, &status](const Status& s) {
+ status = s;
+ n.Notify();
+ });
+ n.WaitForNotification();
+ if (!status.ok()) return status;
+
+ std::function<void()> destructor =
+ GetRemoteTensorDestructor(ctx, eager_client, context_id, id, 0);
+
+ *result = new TensorHandle(id, /*output_num=*/0, /*remote_shape_node_id=*/0,
+ tensor->dtype(), std::move(destructor),
+ recv_device, recv_device, ctx);
+ (*result)->SetRemoteShape(MakeUnique<TensorShape>(tensor->shape()));
+
+ return Status::OK();
+}
+
Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
int* num_retvals) {
#ifdef __ANDROID__
@@ -596,15 +687,21 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
TF_RETURN_IF_ERROR(
ctx->GetClientAndContextID(op->Device(), &eager_client, &context_id));
- eager::EnqueueRequest request;
+ std::unique_ptr<eager::EnqueueRequest> request(new eager::EnqueueRequest);
eager::EnqueueResponse response;
- auto* remote_op = request.add_queue()->mutable_operation();
+ request->set_context_id(context_id);
+
+ auto* remote_op = request->add_queue()->mutable_operation();
for (int i = 0; i < op->Inputs().size(); i++) {
tensorflow::Device* input_device;
TF_RETURN_IF_ERROR(op->Inputs()[i]->Device(&input_device));
- if (op->Device() != input_device) {
+ if (op->Device() != input_device &&
+ // If the expected and actual devices are on the same task, don't
+ // explicitly copy, and instead depend on the copy to happen locally
+ // when the op is executed on the device.
+ !OnSameTask(ctx, op->Device(), input_device)) {
// TODO(b/110044833): It's possible the same tensor gets copied to the
// remote device repeatedly.
TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
@@ -629,8 +726,6 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
op->Attrs().FillAttrValueMap(remote_op->mutable_attrs());
remote_op->set_device(op->Device()->name());
- request.set_context_id(context_id);
-
DataTypeVector output_dtypes;
TF_RETURN_IF_ERROR(GetOutputDTypes(op, &output_dtypes));
@@ -652,32 +747,11 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
for (int i = 0; i < *num_retvals; i++) {
// TODO(nareshmodi): Change the callback to instead add the decref to a list
// of pending decrefs that we can send as a batch with the next execute.
- std::function<void()> callback = [ctx, eager_client, context_id, id, i]() {
- eager::EnqueueRequest request;
- request.set_context_id(context_id);
-
- auto* handle_to_decref = request.add_queue()->mutable_handle_to_decref();
- handle_to_decref->set_op_id(id);
- handle_to_decref->set_output_num(i);
-
- if (ctx->Async()) {
- tensorflow::uint64 id = ctx->NextId();
- auto* node = new eager::RemoteExecuteNode(id, request, eager_client);
- ctx->ExecutorAdd(node);
- } else {
- Notification n;
- eager::EnqueueResponse response;
- eager_client->EnqueueAsync(
- &request, &response,
- [&n](const tensorflow::Status& s) { n.Notify(); });
- n.WaitForNotification();
- }
-
- return tensorflow::Status::OK();
- };
+ std::function<void()> destructor =
+ GetRemoteTensorDestructor(ctx, eager_client, context_id, id, i);
retvals[i] = new TensorHandle(remote_op->id(), i, remote_node_id,
- output_dtypes[i], std::move(callback),
+ output_dtypes[i], std::move(destructor),
op_device, op_device, op->EagerContext());
}
@@ -691,7 +765,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
}
// Unable to capture via std::move, so bind instead.
auto* node = new eager::RemoteExecuteNode(
- remote_node_id, request, eager_client, op->Inputs(),
+ remote_node_id, std::move(request), eager_client, op->Inputs(),
std::bind(
[](const gtl::InlinedVector<TensorHandle*, 2>& retvals,
const Status& status, const eager::EnqueueResponse& response) {
@@ -708,7 +782,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
} else {
Notification n;
Status status;
- eager_client->EnqueueAsync(&request, &response,
+ eager_client->EnqueueAsync(request.get(), &response,
[&n, &status](const Status& s) {
status = s;
n.Notify();
@@ -937,6 +1011,8 @@ Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
if (sender_is_local && recver_is_local) {
return LocalEagerCopyToDevice(h, ctx, recv_device, result);
+ } else if (ctx->UseSendTensorRPC() && sender_is_local && !recver_is_local) {
+ return EagerRemoteSendTensor(ctx, h, recv_device, result);
} else {
string wire_id = GetUniqueWireID();
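The `OnSameTask` helper added above compares only the job, replica, and task fields of two device names, so `/device:GPU:0` and `/device:CPU:0` on the same worker count as co-located. An equivalent check in Python terms (a sketch using the public `tf.DeviceSpec` parser):

```python
import tensorflow as tf


def on_same_task_sketch(name_a, name_b):
  a = tf.DeviceSpec.from_string(name_a)
  b = tf.DeviceSpec.from_string(name_b)
  return (a.job, a.replica, a.task) == (b.job, b.replica, b.task)


print(on_same_task_sketch('/job:worker/replica:0/task:1/device:GPU:0',
                          '/job:worker/replica:0/task:1/device:CPU:0'))  # True
```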
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index a93cfa2ec5..54bbe84b57 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -746,6 +746,8 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
rets_alloc_attrs.push_back(ret_alloc_attrs);
}
+ bool allow_dead_tensors = opts.allow_dead_tensors;
+
// The ProcFLR sends the arguments to the function from the source_device to
// the target_device. So here we receive those arguments. Similarly, when the
// computation is done and stored in *rets, we send the return values back
@@ -756,7 +758,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
device_context, args_alloc_attrs, rendezvous, remote_args,
[frame, remote_args, item, source_device, target_device,
target_incarnation, rendezvous, device_context, rets, done, exec_args,
- rets_alloc_attrs](const Status& status) {
+ rets_alloc_attrs, allow_dead_tensors](const Status& status) {
Status s = status;
if (s.ok()) {
s = frame->SetArgs(*remote_args);
@@ -769,13 +771,13 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
return;
}
item->exec->RunAsync(
- *exec_args,
- [frame, rets, done, source_device, target_device,
- target_incarnation, rendezvous, device_context, remote_args,
- exec_args, rets_alloc_attrs](const Status& status) {
+ *exec_args, [frame, rets, done, source_device, target_device,
+ target_incarnation, rendezvous, device_context,
+ remote_args, exec_args, rets_alloc_attrs,
+ allow_dead_tensors](const Status& status) {
Status s = status;
if (s.ok()) {
- s = frame->ConsumeRetvals(rets);
+ s = frame->ConsumeRetvals(rets, allow_dead_tensors);
}
delete frame;
if (!s.ok()) {
@@ -859,14 +861,15 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
return;
}
+ bool allow_dead_tensors = opts.allow_dead_tensors;
item->exec->RunAsync(
// Executor args
*exec_args,
// Done callback.
- [frame, rets, done, exec_args](const Status& status) {
+ [frame, rets, done, exec_args, allow_dead_tensors](const Status& status) {
Status s = status;
if (s.ok()) {
- s = frame->ConsumeRetvals(rets);
+ s = frame->ConsumeRetvals(rets, allow_dead_tensors);
}
delete frame;
delete exec_args;
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 1e837e9a7e..120f480198 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -1019,8 +1019,9 @@ TEST_F(FunctionLibraryRuntimeTest, Error_BadControlFlow) {
DCHECK_EQ(x.dtype(), DT_INT32);
Tensor y;
HasError(InstantiateAndRun(flr0_, "InvalidControlFlow", {}, {x}, {&y}),
- "The node 'add' has inputs from different frames. The input 'enter' "
- "is in frame 'while'. The input 'i' is in frame ''.");
+ "{{node add}} has inputs from different frames. The input"
+ " {{node enter}} is in frame 'while'. The input {{node i}} is in"
+ " frame ''.");
}
TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index 4898448476..3c1c31aa73 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -15,11 +15,80 @@ limitations under the License.
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/platform/stacktrace.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/protobuf/config.pb.h"
namespace tensorflow {
+namespace {
+// The EventMgr has 1 thread for the polling loop and one to execute
+// event callback functions. Issues for reconsideration:
+// - Is this the right number of threads?
+// - Should EventMgrs be shared between GPUDevices on a multi-GPU machine?
+static const int kNumThreads = 2;
+} // namespace
+
+namespace gpu_event_mgr {
+class ThreadLabel {
+ public:
+ static const char* GetValue() { return value_; }
+
+ // v must be a static const because value_ will capture and use its value
+ // until reset or thread terminates.
+ static void SetValue(const char* v) { value_ = v; }
+
+ private:
+ static thread_local const char* value_;
+};
+thread_local const char* ThreadLabel::value_ = "";
+
+void WarnIfInCallback(std::function<void()> f) {
+ const char* label = ThreadLabel::GetValue();
+ if (label && !strcmp(label, "gpu_event_mgr")) {
+ if (f) {
+ f();
+ } else {
+ LOG(WARNING) << "Executing inside EventMgr callback thread: "
+ << CurrentStackTrace();
+ }
+ }
+}
+
+void InitThreadpoolLabels(thread::ThreadPool* threadpool) {
+ static const char* label = "gpu_event_mgr";
+ mutex mu;
+ int init_count = 0;
+ condition_variable all_initialized;
+ int exit_count = 0;
+ condition_variable ready_to_exit;
+ const int num_threads = threadpool->NumThreads();
+ for (int i = 0; i < num_threads; ++i) {
+ threadpool->Schedule([num_threads, &mu, &init_count, &all_initialized,
+ &exit_count, &ready_to_exit]() {
+ gpu_event_mgr::ThreadLabel::SetValue(label);
+ mutex_lock l(mu);
+ ++init_count;
+ if (init_count == num_threads) {
+ all_initialized.notify_all();
+ }
+ while (init_count < num_threads) {
+ all_initialized.wait(l);
+ }
+ if (++exit_count == num_threads) {
+ ready_to_exit.notify_all();
+ }
+ });
+ }
+ {
+ mutex_lock l(mu);
+ while (exit_count < num_threads) {
+ ready_to_exit.wait(l);
+ }
+ }
+}
+} // namespace gpu_event_mgr
+
EventMgr::EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
: exec_(se),
deferred_bytes_threshold_(gpu_options.deferred_deletion_bytes()
@@ -31,9 +100,8 @@ EventMgr::EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
accumulated_stream_(nullptr),
accumulated_tensors_(new TensorReferenceVector),
accumulated_tensor_bytes_(0),
- // threadpool_ has 1 thread for the polling loop, and one to execute
- // event callback functions. Maybe we should have more?
- threadpool_(Env::Default(), "GPU_Event_Manager", 2) {
+ threadpool_(Env::Default(), "GPU_Event_Manager", kNumThreads) {
+ gpu_event_mgr::InitThreadpoolLabels(&threadpool_);
StartPollingLoop();
}
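`InitThreadpoolLabels` above tags every worker thread exactly once by holding all scheduled tasks at a barrier until each thread has run one. The same trick in Python (a sketch using `concurrent.futures`, assuming the pool has at least `num_threads` workers; the C++ version additionally waits for all tasks to exit):

```python
import threading
from concurrent.futures import ThreadPoolExecutor

_thread_label = threading.local()


def label_all_threads_sketch(pool, num_threads, label):
  barrier = threading.Barrier(num_threads)

  def tag():
    _thread_label.value = label
    # No task finishes until every pool thread has been tagged, so no
    # thread can pick up a second tagging task.
    barrier.wait()

  for _ in range(num_threads):
    pool.submit(tag)


pool = ThreadPoolExecutor(max_workers=2)
label_all_threads_sketch(pool, 2, 'gpu_event_mgr')
pool.shutdown(wait=True)
```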
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index b26f88a201..f0a109cc10 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -39,6 +39,25 @@ namespace tensorflow {
class GPUOptions;
+// The callback provided to EventMgr::ThenExecute must not block or take a long
+// time. If it does, performance may be impacted and GPU memory may be
+// exhausted. This macro is for checking that an EventMgr thread is not
+// accidentally entering blocking parts of the code, e.g. the RPC subsystem.
+//
+// Intended use is something like
+//
+// void RespondToAnRPC(Params* params) {
+// WARN_IF_IN_EVENT_MGR_THREAD;
+// if (params->status.ok()) { ...
+//
+namespace gpu_event_mgr {
+// Logs a stack trace if the current execution thread belongs to this
+// EventMgr object. If f is not nullptr, executes f instead of logging the
+// stack trace.
+void WarnIfInCallback(std::function<void()> f);
+} // namespace gpu_event_mgr
+#define WARN_IF_IN_EVENT_MGR_THREAD gpu_event_mgr::WarnIfInCallback(nullptr)
+
// An object to keep track of pending Events in the StreamExecutor streams
// and associated Tensors that cannot safely be deleted until the associated
// Events are recorded.
@@ -74,6 +93,9 @@ class EventMgr {
FreeMemory(to_free);
}
+ // Execute func when all pending stream actions have completed.
+ // func must be brief and non-blocking since it executes in the one
+ // thread used for all such callbacks and also buffer deletions.
inline void ThenExecute(se::Stream* stream, std::function<void()> func) {
ToFreeVector to_free;
{
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index c5ff6c97a1..d2adf699f5 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
#include <atomic>
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/protobuf/config.pb.h"
@@ -243,6 +244,28 @@ TEST(EventMgr, NonEmptyShutdown) {
}
}
+// Tests that WarnIfInCallback() triggers correctly.
+TEST(EventMgr, WarnIfInCallback) {
+ auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
+ EventMgr em(stream_exec, GPUOptions());
+ TEST_EventMgrHelper th(&em);
+ std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
+ CHECK(stream);
+ stream->Init();
+ bool hit = false;
+ gpu_event_mgr::WarnIfInCallback([&hit] { hit = true; });
+ EXPECT_FALSE(hit);
+ Notification note;
+ em.ThenExecute(stream.get(), [&hit, &note]() {
+ gpu_event_mgr::WarnIfInCallback([&hit, &note] {
+ hit = true;
+ note.Notify();
+ });
+ });
+ note.WaitForNotification();
+ EXPECT_TRUE(hit);
+}
+
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 29f702699f..94e10dbfa2 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -22,7 +22,6 @@ limitations under the License.
#ifdef INTEL_MKL
#include <cstdlib>
-#include <string>
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/lib/strings/numbers.h"
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 6781c87f6c..613470365d 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -41,10 +41,8 @@ namespace {
const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
-// Returns a list of devices sorted by preferred type and then name
-// from 'devices' whose type is in 'supported_device_types'. This
-// function searches the device types in 'supported_device_types' and
-// returns the subset of devices that match.
+// Returns a list of devices having type in supported_device_types. The
+// returned list is sorted by preferred type (higher numeric type is preferred).
std::vector<Device*> FilterSupportedDevices(
const std::vector<Device*>& devices,
const DeviceTypeVector& supported_device_types) {
@@ -81,12 +79,12 @@ std::vector<Device*> FilterSupportedDevices(
// DeviceSet device_set = ...;
// ColocationGraph colocation_graph(graph, device_set);
//
-// // Add all the nodes of graph to colocation_graph.
+// // Add all the nodes of the `graph` to the `colocation_graph`.
// for (Node* node : graph.nodes()) {
// TF_RETURN_IF_ERROR(colocation_graph.AddNode(*node));
// }
//
-// // Add one or more colocation constraint.
+// // Add one or more colocation constraints.
// Node node_1 = *graph.FindNodeId(...);
// Node node_2 = *graph.FindNodeId(...);
// TF_RETURN_IF_ERROR(colocation_graph.ColocateNodes(node_1, node_2));
@@ -96,9 +94,9 @@ std::vector<Device*> FilterSupportedDevices(
// TF_RETURN_IF_ERROR(colocation_graph.AssignDevice(node));
// }
//
-// The implementation uses the union-find algorithm to maintain the
-// connected components efficiently and incrementally as edges
-// (implied by ColocationGraph::ColocateNodes() invocations) are added.
+// This implementation uses the union-find algorithm to efficiently and
+// incrementally maintain the connected components as edges are added via
+// ColocationGraph::ColocateNodes() invocations.
class ColocationGraph {
public:
ColocationGraph(Graph* graph, const DeviceSet* device_set,
@@ -134,13 +132,9 @@ class ColocationGraph {
std::unordered_map<StringPiece, const Node*, StringPieceHasher>
colocation_group_root;
- for (Node* node : graph_->nodes()) {
- if (!node->IsOp()) {
- continue;
- }
-
- // When adding the node, identify whether it is part of a
- // colocation group.
+ for (Node* node : graph_->op_nodes()) {
+ // When adding the node, identify whether it is part of a colocation
+ // group.
// This code is effectively the equivalent of GetNodeAttr() for a string
// array, but it avoids all internal allocations (the allocation of the
@@ -219,11 +213,10 @@ class ColocationGraph {
Member& x_root_member = members_[x_root];
Member& y_root_member = members_[y_root];
- // Merge the sets by swinging the parent pointer of the smaller
- // tree to point to the root of the larger tree. Together with
- // path compression in ColocationGraph::FindRoot, this ensures
- // that we do not experience pathological performance on graphs
- // such as chains.
+ // Merge the sets by setting the parent pointer of the smaller tree's root
+ // node to point to the root of the larger tree. Together with path
+ // compression in ColocationGraph::FindRoot, this ensures that we do not
+ // experience pathological performance on graphs such as chains.
int new_root, old_root;
if (x_root_member.rank < y_root_member.rank) {
// The tree rooted at x_root is shallower, so connect it to
@@ -611,22 +604,16 @@ class ColocationGraph {
// given id is connected.
int FindRoot(int node_id) {
Member& member = members_[node_id];
-
- int parent = member.parent;
- DCHECK_GE(parent, 0);
-
- if (parent != node_id) {
- // NOTE: Compress paths from node_id to its root, so that future
- // calls to FindRoot and ColocateNodes are more efficient.
- int root = FindRoot(parent);
- if (parent != root) {
- parent = root;
- member.parent = root;
- }
+ DCHECK_GE(member.parent, 0);
+ if (member.parent == node_id) {
+ // member.parent is the root of this disjoint tree. Do nothing.
+ } else {
+ member.parent = FindRoot(member.parent);
}
-
- DCHECK_GE(parent, 0);
- return parent;
+ // Now it is guaranteed that member.parent is the root of this disjoint
+ // tree.
+ DCHECK_GE(member.parent, 0);
+ return member.parent;
}
// Ensures that the devices of 'dst's resource and reference match the device
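The rewritten FindRoot is standard union-find with path compression; together
with the rank-based merge in ColocateNodes it keeps lookup chains short even on
pathological inputs such as long chains. For reference, a minimal standalone
sketch of the same scheme (a hypothetical DisjointSet, not TensorFlow code):

#include <utility>
#include <vector>

class DisjointSet {
 public:
  explicit DisjointSet(int n) : parent_(n), rank_(n, 0) {
    for (int i = 0; i < n; ++i) parent_[i] = i;
  }

  // Recursive find with path compression, mirroring ColocationGraph::FindRoot.
  int Find(int x) {
    if (parent_[x] != x) parent_[x] = Find(parent_[x]);
    return parent_[x];
  }

  // Union by rank: attach the shallower tree under the deeper root.
  void Union(int x, int y) {
    int rx = Find(x), ry = Find(y);
    if (rx == ry) return;
    if (rank_[rx] < rank_[ry]) std::swap(rx, ry);
    parent_[ry] = rx;
    if (rank_[rx] == rank_[ry]) ++rank_[rx];
  }

 private:
  std::vector<int> parent_;
  std::vector<int> rank_;
};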
diff --git a/tensorflow/core/common_runtime/session_ref.cc b/tensorflow/core/common_runtime/session_ref.cc
new file mode 100644
index 0000000000..b931ef4229
--- /dev/null
+++ b/tensorflow/core/common_runtime/session_ref.cc
@@ -0,0 +1,170 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/session_ref.h"
+
+#include <utility>
+
+namespace tensorflow {
+
+namespace {
+
+// RAII helper that counts active calls on construction/destruction and keeps
+// the wrapped session alive for the duration of a call.
+struct RunCounter {
+ std::shared_ptr<Session> session;
+ uint64* value;
+ mutex* m;
+ condition_variable* cv;
+
+ explicit RunCounter(std::shared_ptr<Session> s, uint64* v, mutex* m,
+ condition_variable* cv)
+ : session(std::move(s)), value(v), m(m), cv(cv) {
+ mutex_lock l(*m);
+ ++*value;
+ }
+
+ ~RunCounter() {
+ mutex_lock l(*m);
+ if (--*value == 0) {
+ cv->notify_all();
+ }
+ }
+};
+
+} // namespace
+
+Status SessionRef::CheckNotClosed() {
+ mutex_lock l(run_lock_);
+ if (session_ == nullptr) return errors::Cancelled("Session has been closed.");
+ return ::tensorflow::Status::OK();
+}
+
+Status SessionRef::Run(const RunOptions& run_options,
+ const std::vector<std::pair<string, Tensor> >& inputs,
+ const std::vector<string>& output_tensor_names,
+ const std::vector<string>& target_node_names,
+ std::vector<Tensor>* outputs,
+ RunMetadata* run_metadata) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->Run(run_options, inputs, output_tensor_names,
+ target_node_names, outputs, run_metadata);
+}
+
+Status SessionRef::Create(const GraphDef& graph) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->Create(graph);
+}
+
+Status SessionRef::Create(const RunOptions& run_options,
+ const GraphDef& graph) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->Create(run_options, graph);
+}
+
+Status SessionRef::Extend(const RunOptions& run_options,
+ const GraphDef& graph) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->Extend(run_options, graph);
+}
+
+Status SessionRef::Extend(const GraphDef& graph) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->Extend(graph);
+}
+
+Status SessionRef::Close(const RunOptions& run_options) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ mutex_lock l(run_lock_);
+ Status status = session_->Close(run_options);
+ session_.reset();
+ while (run_count_ > 0) {
+ run_finished_.wait(l);
+ }
+ return status;
+}
+
+Status SessionRef::Close() {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ mutex_lock l(run_lock_);
+ Status status = session_->Close();
+ session_.reset();
+ while (run_count_ > 0) {
+ run_finished_.wait(l);
+ }
+ return status;
+}
+
+Status SessionRef::Run(const std::vector<std::pair<string, Tensor> >& inputs,
+ const std::vector<string>& output_tensor_names,
+ const std::vector<string>& target_node_names,
+ std::vector<Tensor>* outputs) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->Run(inputs, output_tensor_names, target_node_names,
+ outputs);
+}
+
+Status SessionRef::ListDevices(std::vector<DeviceAttributes>* response) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->ListDevices(response);
+}
+
+Status SessionRef::PRunSetup(const std::vector<string>& input_names,
+ const std::vector<string>& output_names,
+ const std::vector<string>& target_nodes,
+ string* handle) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->PRunSetup(input_names, output_names, target_nodes, handle);
+}
+
+Status SessionRef::PRun(const string& handle,
+ const std::vector<std::pair<string, Tensor> >& inputs,
+ const std::vector<string>& output_names,
+ std::vector<Tensor>* outputs) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->PRun(handle, inputs, output_names, outputs);
+}
+
+Status SessionRef::MakeCallable(const CallableOptions& callable_options,
+ CallableHandle* out_handle) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->MakeCallable(callable_options, out_handle);
+}
+
+Status SessionRef::RunCallable(CallableHandle handle,
+ const std::vector<Tensor>& feed_tensors,
+ std::vector<Tensor>* fetch_tensors,
+ RunMetadata* run_metadata) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->RunCallable(handle, feed_tensors, fetch_tensors,
+ run_metadata);
+}
+
+Status SessionRef::ReleaseCallable(CallableHandle handle) {
+ TF_RETURN_IF_ERROR(CheckNotClosed());
+ RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+ return rc.session->ReleaseCallable(handle);
+}
+
+} // namespace tensorflow
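Each method above copies session_ into a RunCounter before dispatching, so a
concurrent Close() can reset session_ without destroying the Session while a
call is in flight; the counter's destructor performs the notify_all() that
Close() blocks on. A condensed sketch of this counted-scope idiom (illustrative
names only, not part of the change):

#include <condition_variable>
#include <memory>
#include <mutex>

struct ScopedCall {
  // Holds its own shared_ptr copy so the resource outlives this call even if
  // the owner drops its reference concurrently.
  std::shared_ptr<int> resource;
  int* active;
  std::mutex* mu;
  std::condition_variable* cv;

  ScopedCall(std::shared_ptr<int> r, int* a, std::mutex* m,
             std::condition_variable* v)
      : resource(std::move(r)), active(a), mu(m), cv(v) {
    std::lock_guard<std::mutex> l(*mu);
    ++*active;
  }

  ~ScopedCall() {
    std::lock_guard<std::mutex> l(*mu);
    if (--*active == 0) cv->notify_all();  // Wakes a blocked Close().
  }
};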
diff --git a/tensorflow/core/common_runtime/session_ref.h b/tensorflow/core/common_runtime/session_ref.h
new file mode 100644
index 0000000000..9459e7edbe
--- /dev/null
+++ b/tensorflow/core/common_runtime/session_ref.h
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_REF_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_REF_H_
+
+#include <memory>
+
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+
+// A `SessionRef` manages the lifetime of a wrapped `Session` pointer.
+//
+// SessionRef blocks the return of Close() until all pending operations have
+// been completed or cancelled and the underlying session has been freed. Any
+// subsequent operations on the SessionRef object will return errors::Cancelled.
+class SessionRef : public Session {
+ public:
+ SessionRef(Session* session) : session_(session) {}
+ virtual ~SessionRef() {}
+
+ Status Create(const GraphDef& graph) override;
+ Status Extend(const GraphDef& graph) override;
+ Status Create(const RunOptions& run_options, const GraphDef& graph) override;
+ Status Extend(const RunOptions& run_options, const GraphDef& graph) override;
+ Status Run(const std::vector<std::pair<string, Tensor> >& inputs,
+ const std::vector<string>& output_tensor_names,
+ const std::vector<string>& target_node_names,
+ std::vector<Tensor>* outputs) override;
+
+ Status ListDevices(std::vector<DeviceAttributes>* response) override;
+
+ Status Close() override;
+ Status Close(const RunOptions& run_options) override;
+
+ Status Run(const RunOptions& run_options,
+ const std::vector<std::pair<string, Tensor> >& inputs,
+ const std::vector<string>& output_tensor_names,
+ const std::vector<string>& target_node_names,
+ std::vector<Tensor>* outputs, RunMetadata* run_metadata) override;
+
+ Status PRunSetup(const std::vector<string>& input_names,
+ const std::vector<string>& output_names,
+ const std::vector<string>& target_nodes,
+ string* handle) override;
+
+ Status PRun(const string& handle,
+ const std::vector<std::pair<string, Tensor> >& inputs,
+ const std::vector<string>& output_names,
+ std::vector<Tensor>* outputs) override;
+
+ Status MakeCallable(const CallableOptions& callable_options,
+ CallableHandle* out_handle) override;
+
+ Status RunCallable(CallableHandle handle,
+ const std::vector<Tensor>& feed_tensors,
+ std::vector<Tensor>* fetch_tensors,
+ RunMetadata* run_metadata) override;
+
+ Status ReleaseCallable(CallableHandle handle) override;
+
+ private:
+ mutex run_lock_;
+ condition_variable run_finished_;
+ uint64 run_count_ GUARDED_BY(run_lock_) = {0};
+ std::shared_ptr<Session> session_;
+
+ Status CheckNotClosed();
+};
+
+} // namespace tensorflow
+
+#endif // TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_REF_H_
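A hypothetical usage sketch (graph_def and the fetch name "out:0" are
placeholders; not part of this change):

Status RunViaSessionRef(const GraphDef& graph_def) {
  std::unique_ptr<Session> raw(NewSession(SessionOptions()));
  SessionRef session(raw.release());  // SessionRef takes ownership here.
  TF_RETURN_IF_ERROR(session.Create(graph_def));
  std::vector<Tensor> outputs;
  TF_RETURN_IF_ERROR(session.Run({}, {"out:0"}, {}, &outputs));
  // Close() blocks until any concurrent Run() calls have drained; later calls
  // on `session` return errors::Cancelled.
  return session.Close();
}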
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 74a87215e1..7406ecf4f8 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -111,7 +111,21 @@ Status ThreadPoolDevice::MakeTensorFromProto(
}
#ifdef INTEL_MKL
-REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocator);
+namespace {
+class MklCPUAllocatorFactory : public AllocatorFactory {
+ public:
+ bool NumaEnabled() override { return false; }
+
+ Allocator* CreateAllocator() override { return new MklCPUAllocator; }
+
+  // Note: ignores numa_node for now.
+  SubAllocator* CreateSubAllocator(int numa_node) override {
+ return new MklSubAllocator;
+ }
+};
+
+REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocatorFactory);
+} // namespace
#endif
} // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/eager/eager_client.h b/tensorflow/core/distributed_runtime/eager/eager_client.h
index 9ba8c8d80c..707f3234b9 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_client.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_client.h
@@ -39,6 +39,7 @@ class EagerClient {
CLIENT_METHOD(KeepAlive);
CLIENT_METHOD(CloseContext);
CLIENT_METHOD(RegisterFunction);
+ CLIENT_METHOD(SendTensor);
#undef CLIENT_METHOD
};
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
index 466e779fab..916c8720f0 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -81,10 +81,11 @@ Status GetNumRetvals(tensorflow::EagerContext* context, const string& op_name,
Status EagerServiceImpl::CreateContext(const CreateContextRequest* request,
CreateContextResponse* response) {
- //make sure env_ , env_->rendezvous_mgr available
+  // Make sure env_ and env_->rendezvous_mgr are available.
if (env_ == nullptr || env_->rendezvous_mgr == nullptr) {
- return tensorflow::errors::Internal("invalid eager env_ or env_->rendezvous_mgr.");
- }
+ return tensorflow::errors::Internal(
+ "invalid eager env_ or env_->rendezvous_mgr.");
+ }
std::vector<tensorflow::Device*> devices;
TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
@@ -266,6 +267,35 @@ Status EagerServiceImpl::RegisterFunction(
return context->Context()->AddFunctionDef(request->function_def());
}
+Status EagerServiceImpl::SendTensor(const SendTensorRequest* request,
+ SendTensorResponse* response) {
+ ServerContext* context = nullptr;
+ TF_RETURN_IF_ERROR(GetServerContext(request->context_id(), &context));
+ core::ScopedUnref context_unref(context);
+
+ tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> tensors;
+ for (const auto& tensor_proto : request->tensors()) {
+ Tensor tensor;
+ if (!tensor.FromProto(tensor_proto)) {
+ return errors::InvalidArgument("Unable to parse tensor proto");
+ }
+
+ TensorHandle* tensor_handle =
+ new TensorHandle(tensor, nullptr, nullptr, nullptr);
+
+ TensorHandle* copied_handle = nullptr;
+ TF_RETURN_IF_ERROR(EagerCopyToDevice(tensor_handle, context->Context(),
+ request->device_name().c_str(),
+ &copied_handle));
+ tensors.push_back(copied_handle);
+ tensor_handle->Unref();
+ }
+
+ context->AddOperationOutputs(tensors, request->op_id());
+
+ return Status::OK();
+}
+
tensorflow::Status EagerServiceImpl::GetServerContext(
uint64 context_id, ServerContext** server_context) {
mutex_lock l(contexts_mu_);
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
index b0e4aa84b9..718b4e2457 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
@@ -62,6 +62,9 @@ class EagerServiceImpl {
Status RegisterFunction(const RegisterFunctionRequest* request,
RegisterFunctionResponse* response);
+ Status SendTensor(const SendTensorRequest* request,
+ SendTensorResponse* response);
+
protected:
// This is the server-side execution context. All state regarding execution of
// a client's ops is held in this server-side context (all generated tensors,
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
index b98386ba86..d1f2a6da8f 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
@@ -84,7 +84,7 @@ class EagerServiceImplTest : public ::testing::Test {
std::unique_ptr<DeviceMgr> device_mgr_;
};
-void SetTensorProto(AttrValue* val) {
+void SetTensorProto(TensorProto* tensor_proto) {
int64_t dims[] = {2, 2};
float data[] = {1.0f, 2.0f, 3.0f, 4.0f};
TF_Tensor* t = TF_AllocateTensor(
@@ -92,7 +92,7 @@ void SetTensorProto(AttrValue* val) {
memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
tensorflow::Tensor tensor;
TF_ASSERT_OK(tensorflow::TF_TensorToTensor(t, &tensor));
- tensor.AsProtoTensorContent(val->mutable_tensor());
+ tensor.AsProtoTensorContent(tensor_proto);
TF_DeleteTensor(t);
}
@@ -175,7 +175,7 @@ TEST_F(EagerServiceImplTest, BasicTest) {
val.set_type(tensorflow::DataType::DT_FLOAT);
const_attrs.insert({"dtype", val});
val.Clear();
- SetTensorProto(&val);
+ SetTensorProto(val.mutable_tensor());
const_attrs.insert({"value", val});
AddOperationToEnqueueRequest(1, "Const", {}, const_attrs,
@@ -260,7 +260,7 @@ TEST_F(EagerServiceImplTest, BasicFunctionTest) {
const_attrs.insert({"dtype", val});
val.Clear();
- SetTensorProto(&val);
+ SetTensorProto(val.mutable_tensor());
const_attrs.insert({"value", val});
AddOperationToEnqueueRequest(1, "Const", {}, const_attrs,
@@ -294,6 +294,77 @@ TEST_F(EagerServiceImplTest, BasicFunctionTest) {
&close_context_response));
}
+// Creates a context, sends a tensor to it via the SendTensor RPC, and then
+// uses the tensor as an input to an operation.
+TEST_F(EagerServiceImplTest, SendTensorTest) {
+ TestEagerServiceImpl eager_service_impl(&worker_env_);
+
+ CreateContextRequest request;
+ request.mutable_server_def()->set_job_name("localhost");
+ request.mutable_server_def()->set_task_index(0);
+ request.set_rendezvous_id(random::New64());
+ CreateContextResponse response;
+
+ TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response));
+
+ uint64 context_id = response.context_id();
+
+ SendTensorRequest send_tensor_request;
+ send_tensor_request.set_context_id(context_id);
+ send_tensor_request.set_op_id(1);
+ SetTensorProto(send_tensor_request.add_tensors());
+ SendTensorResponse send_tensor_response;
+
+ TF_ASSERT_OK(eager_service_impl.SendTensor(&send_tensor_request,
+ &send_tensor_response));
+
+ EnqueueRequest remote_enqueue_request;
+ remote_enqueue_request.set_context_id(context_id);
+ EnqueueResponse remote_enqueue_response;
+
+ std::unordered_map<string, AttrValue> attrs;
+ AttrValue val;
+ val.set_type(tensorflow::DataType::DT_FLOAT);
+ attrs.insert({"T", val});
+ val.Clear();
+ val.set_b(false);
+ attrs.insert({"transpose_a", val});
+ attrs.insert({"transpose_b", val});
+
+ AddOperationToEnqueueRequest(2, "MatMul", {{1, 0}, {1, 0}}, attrs,
+ "/job:localhost/replica:0/task:0/device:CPU:0",
+ &remote_enqueue_request);
+
+ TF_ASSERT_OK(eager_service_impl.Enqueue(&remote_enqueue_request,
+ &remote_enqueue_response));
+
+ const tensorflow::Tensor* t = nullptr;
+ tensorflow::TensorHandle* tensor_handle;
+ TF_ASSERT_OK(eager_service_impl.GetTensorHandle(
+ response.context_id(), RemoteTensorHandleInternal(2, 0), &tensor_handle));
+ TF_ASSERT_OK(tensor_handle->Tensor(&t));
+
+ Device* device = nullptr;
+ TF_ASSERT_OK(tensor_handle->Device(&device));
+ EXPECT_NE(device, nullptr);
+ EXPECT_EQ(device->name(), "/job:localhost/replica:0/task:0/device:CPU:0");
+
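+  // The sent tensor is [[1, 2], [3, 4]]; its MatMul with itself is
+  // [[7, 10], [15, 22]], which the element-wise checks below expect.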
+ auto actual = t->flat<float>();
+ EXPECT_EQ(4, actual.size());
+
+ EXPECT_EQ(7, actual(0));
+ EXPECT_EQ(10, actual(1));
+ EXPECT_EQ(15, actual(2));
+ EXPECT_EQ(22, actual(3));
+
+ CloseContextRequest close_context_request;
+ close_context_request.set_context_id(context_id);
+ CloseContextResponse close_context_response;
+ TF_ASSERT_OK(eager_service_impl.CloseContext(&close_context_request,
+ &close_context_response));
+}
+
} // namespace
} // namespace eager
} // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
index 28b68c3b88..0e3a68c4d8 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
+++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
@@ -29,8 +29,8 @@ namespace eager {
class RemoteExecuteNode : public tensorflow::EagerNode {
public:
RemoteExecuteNode(
- tensorflow::uint64 id, const tensorflow::eager::EnqueueRequest& request,
- tensorflow::eager::EagerClient* eager_client,
+ tensorflow::uint64 id, std::unique_ptr<EnqueueRequest> request,
+ EagerClient* eager_client,
const gtl::InlinedVector<TensorHandle*, 4>& inputs,
std::function<void(const Status& status, const EnqueueResponse& response)>
done_callback)
@@ -45,8 +45,8 @@ class RemoteExecuteNode : public tensorflow::EagerNode {
}
RemoteExecuteNode(tensorflow::uint64 id,
- const tensorflow::eager::EnqueueRequest& request,
- tensorflow::eager::EagerClient* eager_client)
+ std::unique_ptr<EnqueueRequest> request,
+ EagerClient* eager_client)
: tensorflow::EagerNode(id),
request_(std::move(request)),
eager_client_(eager_client) {}
@@ -58,10 +58,10 @@ class RemoteExecuteNode : public tensorflow::EagerNode {
}
tensorflow::Status Run() override {
- tensorflow::eager::EnqueueResponse response;
- tensorflow::Status status;
+ EnqueueResponse response;
+ Status status;
Notification n;
- eager_client_->EnqueueAsync(&request_, &response,
+ eager_client_->EnqueueAsync(request_.get(), &response,
[&n, &status](const tensorflow::Status& s) {
status.Update(s);
n.Notify();
@@ -76,9 +76,8 @@ class RemoteExecuteNode : public tensorflow::EagerNode {
}
private:
- EnqueueRequest request_;
- tensorflow::eager::EagerClient*
- eager_client_; // Not owned, and must outlive the RemoteExecuteNode.
+ std::unique_ptr<EnqueueRequest> request_;
+ EagerClient* eager_client_; // Not owned, and must outlive this node.
// This is required to ensure that the tensor handles stay alive across the
// execution.
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
index b23466037f..181422118c 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
@@ -49,6 +49,7 @@ class GrpcEagerClient : public EagerClient {
CLIENT_METHOD(KeepAlive);
CLIENT_METHOD(CloseContext);
CLIENT_METHOD(RegisterFunction);
+ CLIENT_METHOD(SendTensor);
#undef CLIENT_METHOD
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
index 39ab6856c5..ab3aa3fd1d 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
@@ -36,6 +36,7 @@ static const char* grpcEagerService_method_names[] = {
"/tensorflow.eager.EagerService/KeepAlive",
"/tensorflow.eager.EagerService/CloseContext",
"/tensorflow.eager.EagerService/RegisterFunction",
+ "/tensorflow.eager.EagerService/SendTensor",
};
std::unique_ptr<EagerService::Stub> EagerService::NewStub(
@@ -62,7 +63,9 @@ EagerService::Stub::Stub(
::grpc::internal::RpcMethod::NORMAL_RPC, channel),
rpcmethod_RegisterFunction_(grpcEagerService_method_names[5],
::grpc::internal::RpcMethod::NORMAL_RPC,
- channel) {}
+ channel),
+ rpcmethod_SendTensor_(grpcEagerService_method_names[6],
+ ::grpc::internal::RpcMethod::NORMAL_RPC, channel) {}
::grpc::Status EagerService::Stub::CreateContext(
::grpc::ClientContext* context, const CreateContextRequest& request,
@@ -106,8 +109,15 @@ EagerService::Stub::Stub(
channel_.get(), rpcmethod_RegisterFunction_, context, request, response);
}
+::grpc::Status EagerService::Stub::SendTensor(::grpc::ClientContext* context,
+ const SendTensorRequest& request,
+ SendTensorResponse* response) {
+ return ::grpc::internal::BlockingUnaryCall(
+ channel_.get(), rpcmethod_SendTensor_, context, request, response);
+}
+
EagerService::AsyncService::AsyncService() {
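+  // Note: the bound below must stay in sync with the number of entries in
+  // grpcEagerService_method_names.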
- for (int i = 0; i < 6; ++i) {
+ for (int i = 0; i < 7; ++i) {
AddMethod(new ::grpc::internal::RpcServiceMethod(
grpcEagerService_method_names[i],
::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
index 66458186ad..521e0ac4fa 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
@@ -69,6 +69,9 @@ class EagerService final {
virtual ::grpc::Status RegisterFunction(
::grpc::ClientContext* context, const RegisterFunctionRequest& request,
RegisterFunctionResponse* response) = 0;
+ virtual ::grpc::Status SendTensor(::grpc::ClientContext* context,
+ const SendTensorRequest& request,
+ SendTensorResponse* response) = 0;
};
class Stub final : public StubInterface {
public:
@@ -91,6 +94,9 @@ class EagerService final {
::grpc::Status RegisterFunction(
::grpc::ClientContext* context, const RegisterFunctionRequest& request,
RegisterFunctionResponse* response) override;
+ ::grpc::Status SendTensor(::grpc::ClientContext* context,
+ const SendTensorRequest& request,
+ SendTensorResponse* response) override;
private:
std::shared_ptr< ::grpc::ChannelInterface> channel_;
@@ -100,6 +106,7 @@ class EagerService final {
const ::grpc::internal::RpcMethod rpcmethod_KeepAlive_;
const ::grpc::internal::RpcMethod rpcmethod_CloseContext_;
const ::grpc::internal::RpcMethod rpcmethod_RegisterFunction_;
+ const ::grpc::internal::RpcMethod rpcmethod_SendTensor_;
};
static std::unique_ptr<Stub> NewStub(
const std::shared_ptr< ::grpc::ChannelInterface>& channel,
@@ -157,6 +164,14 @@ class EagerService final {
::grpc::Service::RequestAsyncUnary(5, context, request, response,
new_call_cq, notification_cq, tag);
}
+ void RequestSendTensor(
+ ::grpc::ServerContext* context, SendTensorRequest* request,
+ ::grpc::ServerAsyncResponseWriter<SendTensorResponse>* response,
+ ::grpc::CompletionQueue* new_call_cq,
+ ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+ ::grpc::Service::RequestAsyncUnary(6, context, request, response,
+ new_call_cq, notification_cq, tag);
+ }
};
};
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
index 52e06c263d..f511674e1f 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
@@ -27,9 +27,7 @@ namespace eager {
GrpcEagerServiceImpl::GrpcEagerServiceImpl(
const WorkerEnv* env, ::grpc::ServerBuilder* server_builder)
- : local_impl_(env) {
- request_handler_threadpool_ =
- MakeUnique<thread::ThreadPool>(env->env, "EagerServiceRequestHandler", 4);
+ : env_(env), local_impl_(env) {
server_builder->RegisterService(&service_);
cq_ = server_builder->AddCompletionQueue();
}
@@ -50,6 +48,7 @@ void GrpcEagerServiceImpl::HandleRPCsLoop() {
ENQUEUE_REQUEST(KeepAlive);
ENQUEUE_REQUEST(CloseContext);
ENQUEUE_REQUEST(RegisterFunction);
+ ENQUEUE_REQUEST(SendTensor);
#undef ENQUEUE_REQUEST
void* tag; // Matches the operation started against this cq_.
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
index 9a94026342..537e9043bd 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
@@ -45,7 +45,7 @@ class GrpcEagerServiceImpl : public AsyncServiceInterface {
private:
#define HANDLER(method) \
void method##Handler(EagerCall<method##Request, method##Response>* call) { \
- request_handler_threadpool_->Schedule([this, call]() { \
+ env_->compute_pool->Schedule([this, call]() { \
call->SendResponse( \
ToGrpcStatus(local_impl_.method(&call->request, &call->response))); \
}); \
@@ -62,8 +62,10 @@ class GrpcEagerServiceImpl : public AsyncServiceInterface {
HANDLER(KeepAlive);
HANDLER(CloseContext);
HANDLER(RegisterFunction);
+ HANDLER(SendTensor);
#undef HANDLER
+ const WorkerEnv* const env_; // Not owned.
EagerServiceImpl local_impl_;
std::unique_ptr<::grpc::Alarm> shutdown_alarm_;
@@ -71,8 +73,6 @@ class GrpcEagerServiceImpl : public AsyncServiceInterface {
std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
tensorflow::eager::grpc::EagerService::AsyncService service_;
- std::unique_ptr<thread::ThreadPool> request_handler_threadpool_;
-
TF_DISALLOW_COPY_AND_ASSIGN(GrpcEagerServiceImpl);
};
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index db14f6473e..8a6903be9e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -244,6 +244,7 @@ Status GrpcServer::Init(
// Finish setting up master environment.
master_env_.ops = OpRegistry::Global();
master_env_.worker_cache = worker_cache;
+ master_env_.collective_executor_mgr = worker_env_.collective_executor_mgr;
master_env_.master_session_factory =
[config, stats_factory](
SessionOptions options, const MasterEnv* env,
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 1c62d37955..888ed0c57b 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -91,6 +91,11 @@ void EnableCPUAllocatorFullStats(bool enable) {
cpu_allocator_collect_full_stats = enable;
}
+namespace {
+// A default Allocator for CPU devices. ProcessState::GetCPUAllocator() will
+// return a different version that may perform better, but may also lack the
+// optional stats triggered by the functions above.
+// TODO(tucker): Migrate all uses of cpu_allocator() except tests to use
+// ProcessState instead.
class CPUAllocator : public Allocator {
public:
CPUAllocator()
@@ -170,14 +175,42 @@ class CPUAllocator : public Allocator {
TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator);
};
+class CPUAllocatorFactory : public AllocatorFactory {
+ public:
+ Allocator* CreateAllocator() override { return new CPUAllocator; }
+
+ SubAllocator* CreateSubAllocator(int numa_node) override {
+ return new CPUSubAllocator(new CPUAllocator);
+ }
+
+ private:
+ class CPUSubAllocator : public SubAllocator {
+ public:
+ explicit CPUSubAllocator(CPUAllocator* cpu_allocator)
+ : cpu_allocator_(cpu_allocator) {}
+
+ void* Alloc(size_t alignment, size_t num_bytes) override {
+ return cpu_allocator_->AllocateRaw(alignment, num_bytes);
+ }
+
+ void Free(void* ptr, size_t num_bytes) override {
+ cpu_allocator_->DeallocateRaw(ptr);
+ }
+
+ private:
+ CPUAllocator* cpu_allocator_;
+ };
+};
+
+REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocatorFactory);
+} // namespace
+
Allocator* cpu_allocator() {
- static Allocator* cpu_alloc = AllocatorRegistry::Global()->GetAllocator();
+ static Allocator* cpu_alloc =
+ AllocatorFactoryRegistry::singleton()->GetAllocator();
if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) {
cpu_alloc = new TrackingAllocator(cpu_alloc, true);
}
return cpu_alloc;
}
-
-REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocator);
-
} // namespace tensorflow
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2bb4d32d57..774b1fe137 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -376,16 +376,18 @@ struct AllocatorAttributes {
int32 scope_id = 0;
};
-// Returns a trivial implementation of Allocator which uses the system
-// default malloc. The returned allocator is a process singleton.
+// Returns a trivial implementation of Allocator, which is a process singleton.
+// Access through this function is only intended for use in tests and auxiliary
+// processing. Performance sensitive uses should always obtain allocators from
+// ProcessState.
Allocator* cpu_allocator();
-// If 'enable' is true, the process-wide cpu allocator collects
+// If 'enable' is true, the default CPU allocator implementation will collect
// AllocatorStats. By default, it's disabled.
void EnableCPUAllocatorStats(bool enable);
-// If 'enable' is true, the process-wide cpu allocator collects full
-// statistics. By default, it's disabled.
+// If 'enable' is true, the default CPU allocator implementation will collect
+// full statistics. By default, it's disabled.
void EnableCPUAllocatorFullStats(bool enable);
// Abstract interface of an object that does the underlying suballoc/free of
diff --git a/tensorflow/core/framework/allocator_registry.cc b/tensorflow/core/framework/allocator_registry.cc
index 486be39ae3..099c4bacc8 100644
--- a/tensorflow/core/framework/allocator_registry.cc
+++ b/tensorflow/core/framework/allocator_registry.cc
@@ -21,60 +21,110 @@ limitations under the License.
namespace tensorflow {
// static
-AllocatorRegistry* AllocatorRegistry::Global() {
- static AllocatorRegistry* global_allocator_registry = new AllocatorRegistry;
- return global_allocator_registry;
+AllocatorFactoryRegistry* AllocatorFactoryRegistry::singleton() {
+ static AllocatorFactoryRegistry* singleton = new AllocatorFactoryRegistry;
+ return singleton;
}
-Allocator* AllocatorRegistry::GetRegisteredAllocator(const string& name,
- int priority) {
- for (auto entry : allocators_) {
+const AllocatorFactoryRegistry::FactoryEntry*
+AllocatorFactoryRegistry::FindEntry(const string& name, int priority) const {
+ for (auto& entry : factories_) {
if (!name.compare(entry.name) && priority == entry.priority) {
- return entry.allocator;
+ return &entry;
}
}
return nullptr;
}
-void AllocatorRegistry::Register(const string& name, int priority,
- Allocator* allocator) {
+void AllocatorFactoryRegistry::Register(const char* source_file,
+ int source_line, const string& name,
+ int priority,
+ AllocatorFactory* factory) {
+ mutex_lock l(mu_);
+ CHECK(!first_alloc_made_) << "Attempt to register an AllocatorFactory "
+ << "after call to GetAllocator()";
CHECK(!name.empty()) << "Need a valid name for Allocator";
CHECK_GE(priority, 0) << "Priority needs to be non-negative";
- Allocator* existing = GetRegisteredAllocator(name, priority);
+ const FactoryEntry* existing = FindEntry(name, priority);
if (existing != nullptr) {
- // A duplicate is if the registration name and priority match
- // but the Allocator::Name()'s don't match.
- CHECK_EQ(existing->Name(), allocator->Name())
- << "Allocator with name: [" << name << "], type [" << existing->Name()
- << "], priority: [" << priority
- << "] already registered. Choose a different name to register "
- << "an allocator of type " << allocator->Name();
-
- // The allocator names match, so we can just return.
- // It should be safe to delete the allocator since the caller
- // gives up ownership of it.
- delete allocator;
- return;
+ // Duplicate registration is a hard failure.
+ LOG(FATAL) << "New registration for AllocatorFactory with name=" << name
+ << " priority=" << priority << " at location " << source_file
+ << ":" << source_line
+ << " conflicts with previous registration at location "
+ << existing->source_file << ":" << existing->source_line;
}
- AllocatorRegistryEntry tmp_entry;
- tmp_entry.name = name;
- tmp_entry.priority = priority;
- tmp_entry.allocator = allocator;
+ FactoryEntry entry;
+ entry.source_file = source_file;
+ entry.source_line = source_line;
+ entry.name = name;
+ entry.priority = priority;
+ entry.factory.reset(factory);
+ factories_.push_back(std::move(entry));
+}
- allocators_.push_back(tmp_entry);
- int high_pri = -1;
- for (auto entry : allocators_) {
- if (high_pri < entry.priority) {
- m_curr_allocator_ = entry.allocator;
- high_pri = entry.priority;
+Allocator* AllocatorFactoryRegistry::GetAllocator() {
+ mutex_lock l(mu_);
+ first_alloc_made_ = true;
+ FactoryEntry* best_entry = nullptr;
+ for (auto& entry : factories_) {
+ if (best_entry == nullptr) {
+ best_entry = &entry;
+ } else if (entry.priority > best_entry->priority) {
+ best_entry = &entry;
}
}
+ if (best_entry) {
+ if (!best_entry->allocator) {
+ best_entry->allocator.reset(best_entry->factory->CreateAllocator());
+ }
+ return best_entry->allocator.get();
+ } else {
+ LOG(FATAL) << "No registered CPU AllocatorFactory";
+ return nullptr;
+ }
}
-Allocator* AllocatorRegistry::GetAllocator() {
- return CHECK_NOTNULL(m_curr_allocator_);
+SubAllocator* AllocatorFactoryRegistry::GetSubAllocator(int numa_node) {
+ mutex_lock l(mu_);
+ first_alloc_made_ = true;
+ FactoryEntry* best_entry = nullptr;
+ for (auto& entry : factories_) {
+ if (best_entry == nullptr) {
+ best_entry = &entry;
+ } else if (best_entry->factory->NumaEnabled()) {
+ if (entry.factory->NumaEnabled() &&
+ (entry.priority > best_entry->priority)) {
+ best_entry = &entry;
+ }
+ } else {
+ DCHECK(!best_entry->factory->NumaEnabled());
+ if (entry.factory->NumaEnabled() ||
+ (entry.priority > best_entry->priority)) {
+ best_entry = &entry;
+ }
+ }
+ }
+ if (best_entry) {
+ int index = 0;
+ if (numa_node != port::kNUMANoAffinity) {
+      CHECK_LT(numa_node, port::NUMANumNodes());
+ index = 1 + numa_node;
+ }
+    if (best_entry->sub_allocators.size() < static_cast<size_t>(index + 1)) {
+ best_entry->sub_allocators.resize(index + 1);
+ }
+ if (!best_entry->sub_allocators[index].get()) {
+ best_entry->sub_allocators[index].reset(
+ best_entry->factory->CreateSubAllocator(numa_node));
+ }
+ return best_entry->sub_allocators[index].get();
+ } else {
+ LOG(FATAL) << "No registered CPU AllocatorFactory";
+ return nullptr;
+ }
}
} // namespace tensorflow
diff --git a/tensorflow/core/framework/allocator_registry.h b/tensorflow/core/framework/allocator_registry.h
index b26e79ac3b..24f282ce84 100644
--- a/tensorflow/core/framework/allocator_registry.h
+++ b/tensorflow/core/framework/allocator_registry.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
-// Classes to maintain a static registry of memory allocators
+// Classes to maintain a static registry of memory allocator factories.
#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
@@ -21,59 +21,100 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/numa.h"
namespace tensorflow {
-// A global AllocatorRegistry is used to hold allocators for CPU backends
-class AllocatorRegistry {
+class AllocatorFactory {
public:
- // Add an allocator to the registry. Caller releases ownership of
- // 'allocator'.
- void Register(const string& name, int priority, Allocator* allocator);
+ virtual ~AllocatorFactory() {}
- // Return allocator with highest priority
- // If multiple allocators have the same high priority, return one of them
+ // Returns true if the factory will create a functionally different
+ // SubAllocator for different (legal) values of numa_node.
+ virtual bool NumaEnabled() { return false; }
+
+ // Create an Allocator.
+ virtual Allocator* CreateAllocator() = 0;
+
+  // Create a SubAllocator. If NumaEnabled() is true, the returned SubAllocator
+  // will allocate memory local to numa_node. If numa_node == kNUMANoAffinity,
+  // the allocated memory is not specific to any NUMA node.
+ virtual SubAllocator* CreateSubAllocator(int numa_node) = 0;
+};
+
+// A singleton registry of AllocatorFactories.
+//
+// Allocators should be obtained through ProcessState or cpu_allocator()
+// (deprecated), not directly through this interface. The purpose of this
+// registry is to allow link-time discovery of multiple AllocatorFactories,
+// from which ProcessState picks the best fit at startup.
+class AllocatorFactoryRegistry {
+ public:
+ AllocatorFactoryRegistry() {}
+ ~AllocatorFactoryRegistry() {}
+
+ void Register(const char* source_file, int source_line, const string& name,
+ int priority, AllocatorFactory* factory);
+
+  // Returns the 'best fit' Allocator: finds the factory with the highest
+  // priority and returns an allocator constructed by it. If multiple factories
+  // are registered with the same priority, one is picked by unspecified
+  // criteria.
Allocator* GetAllocator();
- // Returns the global registry of allocators.
- static AllocatorRegistry* Global();
+  // Returns the 'best fit' SubAllocator. Looks first for the highest priority
+  // NUMA-enabled factory; if none is registered, falls back to the highest
+  // priority non-NUMA-enabled factory. If the chosen factory is NUMA-enabled,
+  // returns a SubAllocator specific to numa_node; otherwise returns a
+  // NUMA-insensitive SubAllocator.
+ SubAllocator* GetSubAllocator(int numa_node);
+
+ // Returns the singleton value.
+ static AllocatorFactoryRegistry* singleton();
private:
- typedef struct {
+ mutex mu_;
+ bool first_alloc_made_ = false;
+ struct FactoryEntry {
+ const char* source_file;
+ int source_line;
string name;
int priority;
- Allocator* allocator; // not owned
- } AllocatorRegistryEntry;
-
- // Returns the Allocator registered for 'name' and 'priority',
- // or 'nullptr' if not found.
- Allocator* GetRegisteredAllocator(const string& name, int priority);
-
- std::vector<AllocatorRegistryEntry> allocators_;
- Allocator* m_curr_allocator_; // not owned
+ std::unique_ptr<AllocatorFactory> factory;
+ std::unique_ptr<Allocator> allocator;
+ // Index 0 corresponds to kNUMANoAffinity, other indices are (numa_node +
+ // 1).
+ std::vector<std::unique_ptr<SubAllocator>> sub_allocators;
+ };
+ std::vector<FactoryEntry> factories_ GUARDED_BY(mu_);
+
+ // Returns any FactoryEntry registered under 'name' and 'priority',
+ // or 'nullptr' if none found.
+ const FactoryEntry* FindEntry(const string& name, int priority) const
+ EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+ TF_DISALLOW_COPY_AND_ASSIGN(AllocatorFactoryRegistry);
};
-namespace allocator_registration {
-
-class AllocatorRegistration {
+class AllocatorFactoryRegistration {
public:
- AllocatorRegistration(const string& name, int priority,
- Allocator* allocator) {
- AllocatorRegistry::Global()->Register(name, priority, allocator);
+ AllocatorFactoryRegistration(const char* file, int line, const string& name,
+ int priority, AllocatorFactory* factory) {
+ AllocatorFactoryRegistry::singleton()->Register(file, line, name, priority,
+ factory);
}
};
-} // namespace allocator_registration
-
-#define REGISTER_MEM_ALLOCATOR(name, priority, allocator) \
- REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(__COUNTER__, name, priority, allocator)
+#define REGISTER_MEM_ALLOCATOR(name, priority, factory) \
+ REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(__COUNTER__, __FILE__, __LINE__, name, \
+ priority, factory)
-#define REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(ctr, name, priority, allocator) \
- REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator)
+#define REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(ctr, file, line, name, priority, \
+ factory) \
+ REGISTER_MEM_ALLOCATOR_UNIQ(ctr, file, line, name, priority, factory)
-#define REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator) \
- static allocator_registration::AllocatorRegistration \
- register_allocator_##ctr(name, priority, new allocator)
+#define REGISTER_MEM_ALLOCATOR_UNIQ(ctr, file, line, name, priority, factory) \
+ static AllocatorFactoryRegistration allocator_factory_reg_##ctr( \
+ file, line, name, priority, new factory)
} // namespace tensorflow
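For a backend author the new registration path looks like the following sketch
(MyAllocator and MySubAllocator are hypothetical placeholders for real
implementations):

namespace {
class MyAllocatorFactory : public AllocatorFactory {
 public:
  bool NumaEnabled() override { return false; }

  Allocator* CreateAllocator() override { return new MyAllocator; }

  SubAllocator* CreateSubAllocator(int numa_node) override {
    return new MySubAllocator;  // numa_node is ignored when !NumaEnabled().
  }
};

// A priority above 100 wins over DefaultCPUAllocator. Registration must occur
// before the first GetAllocator()/GetSubAllocator() call, or the registry
// CHECK-fails.
REGISTER_MEM_ALLOCATOR("MyAllocator", 150, MyAllocatorFactory);
}  // namespace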
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index 206396a25a..0a1b5e1975 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -45,7 +45,8 @@ class Bfloat16Test : public ::testing::Test,
public ::testing::WithParamInterface<Bfloat16TestParam> {};
TEST_P(Bfloat16Test, TruncateTest) {
- bfloat16 truncated(GetParam().input);
+  bfloat16 truncated = bfloat16::truncate_to_bfloat16(GetParam().input);
+
if (std::isnan(GetParam().input)) {
EXPECT_TRUE(std::isnan(float(truncated)) || std::isinf(float(truncated)));
return;
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 922d34fac9..b184fd91e1 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -184,9 +184,7 @@ class DeviceBase {
virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }
- const bool has_eigen_cpu_device() const {
- return !eigen_cpu_devices_.empty();
- }
+ bool has_eigen_cpu_device() const { return !eigen_cpu_devices_.empty(); }
virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 88d9d65f5a..57bcc0f513 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -865,12 +865,15 @@ Status FunctionCallFrame::GetRetvals(std::vector<Tensor>* rets) const {
return Status::OK();
}
-Status FunctionCallFrame::ConsumeRetvals(std::vector<Tensor>* rets) {
+Status FunctionCallFrame::ConsumeRetvals(std::vector<Tensor>* rets,
+ bool allow_dead_tensors) {
rets->clear();
rets->reserve(rets_.size());
for (size_t i = 0; i < rets_.size(); ++i) {
if (rets_[i].has_val) {
rets->emplace_back(std::move(rets_[i].val));
+ } else if (allow_dead_tensors) {
+ rets->emplace_back();
} else {
return errors::Internal("Retval[", i, "] does not have value");
}
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 8e607b927c..5da9af7db3 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -261,7 +261,10 @@ class FunctionCallFrame : public CallFrameInterface {
// Caller methods.
Status SetArgs(gtl::ArraySlice<Tensor> args);
Status GetRetvals(std::vector<Tensor>* rets) const;
- Status ConsumeRetvals(std::vector<Tensor>* rets);
+
+  // Moves the return values from the frame to rets. If allow_dead_tensors is
+  // false, it fails if any of the retvals does not have a value.
+ Status ConsumeRetvals(std::vector<Tensor>* rets, bool allow_dead_tensors);
size_t num_args() const override { return arg_types_.size(); }
size_t num_retvals() const override { return ret_types_.size(); }
@@ -510,6 +513,9 @@ class FunctionLibraryRuntime {
// If true, we create a new IntraProcessRendezvous, else use the existing
// one.
bool create_rendezvous = false;
+
+  // If true, allows returning dead tensors.
+ bool allow_dead_tensors = false;
};
typedef std::function<void(const Status&)> DoneCallback;
virtual void Run(const Options& opts, Handle handle,
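A hedged sketch of the new dead-tensor semantics (hypothetical frame setup,
assuming no retval has been set):

FunctionCallFrame frame({DT_FLOAT}, {DT_FLOAT});
std::vector<Tensor> rets;
// Strict mode fails with errors::Internal("Retval[0] does not have value").
Status strict = frame.ConsumeRetvals(&rets, /*allow_dead_tensors=*/false);
// Permissive mode succeeds and leaves an uninitialized (dead) Tensor.
Status lenient = frame.ConsumeRetvals(&rets, /*allow_dead_tensors=*/true);
// lenient.ok() is true and rets.size() == 1.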
diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc
index 2b5a0fe1bb..a8eecc1a63 100644
--- a/tensorflow/core/framework/function_testlib.cc
+++ b/tensorflow/core/framework/function_testlib.cc
@@ -45,13 +45,12 @@ GraphDef GDef(gtl::ArraySlice<NodeDef> nodes,
}
// Helper to construct a NodeDef.
-NodeDef NDef(const string& name, const string& op,
- gtl::ArraySlice<string> inputs,
+NodeDef NDef(StringPiece name, StringPiece op, gtl::ArraySlice<string> inputs,
gtl::ArraySlice<std::pair<string, FDH::AttrValueWrapper>> attrs,
const string& device) {
NodeDef n;
- n.set_name(name);
- n.set_op(op);
+ n.set_name(name.ToString());
+ n.set_op(op.ToString());
for (const auto& in : inputs) n.add_input(in);
n.set_device(device);
for (auto na : attrs) n.mutable_attr()->insert({na.first, na.second.proto});
diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h
index b67c5cb1ab..8cf3c6a680 100644
--- a/tensorflow/core/framework/function_testlib.h
+++ b/tensorflow/core/framework/function_testlib.h
@@ -48,7 +48,7 @@ class Attrs {
// Helper to construct a NodeDef.
NodeDef NDef(
- const string& name, const string& op, gtl::ArraySlice<string> inputs,
+ StringPiece name, StringPiece op, gtl::ArraySlice<string> inputs,
gtl::ArraySlice<std::pair<string, FunctionDefHelper::AttrValueWrapper>>
attrs = {},
const string& device = "");
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index e8ea904ebd..0bd79366eb 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -86,7 +86,8 @@ string AttrSlice::SummarizeNode() const {
string SummarizeNode(const Node& node) { return SummarizeNodeDef(node.def()); }
string SummarizeNodeDef(const NodeDef& node_def) {
- string ret = strings::StrCat(node_def.name(), " = ", node_def.op(), "[");
+ string ret = strings::StrCat(FormatNodeDefForError(node_def), " = ",
+ node_def.op(), "[");
strings::StrAppend(&ret, SummarizeAttrsHelper(node_def, node_def.device()));
strings::StrAppend(&ret, "](");
@@ -101,6 +102,14 @@ string SummarizeNodeDef(const NodeDef& node_def) {
return ret;
}
+string FormatNodeForError(const Node& node) {
+ return FormatNodeDefForError(node.def());
+}
+
+string FormatNodeDefForError(const NodeDef& node_def) {
+ return errors::FormatNodeNameForError(node_def.name());
+}
+
const AttrValue* AttrSlice::Find(StringPiece attr_name) const {
// Currently, the collection used for NodeDef::attr() (google::protobuf::Map)
// requires that the keys used for lookups have type 'const string&'. Because
@@ -634,7 +643,7 @@ Status ValidateExternalNodeDefSyntax(const NodeDef& node_def) {
Status AttachDef(const Status& status, const NodeDef& node_def) {
Status ret = status;
errors::AppendToMessage(
- &ret, strings::StrCat(" [[Node: ", SummarizeNodeDef(node_def), "]]"));
+ &ret, strings::StrCat(" [[", SummarizeNodeDef(node_def), "]]"));
return ret;
}
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 64c8b386e8..c012b7c3d3 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -50,6 +50,12 @@ extern const char* const kColocationGroupPrefix;
string SummarizeNode(const Node& node);
string SummarizeNodeDef(const NodeDef& node_def);
+// Produces a formatted string pattern from the node that can uniquely identify
+// it, so that upstream consumers can produce informative error messages. The
+// pattern followed is: {{node <node_name>}}
+string FormatNodeForError(const Node& node);
+string FormatNodeDefForError(const NodeDef& node_def);
+
typedef protobuf::Map<string, AttrValue> AttrValueMap;
// Adds an attr with name <name> and value <value> to *node_def.
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index 35b7b2272b..74cc594863 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -20,6 +20,8 @@ limitations under the License.
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/op_def_builder.h"
#include "tensorflow/core/framework/op_def_util.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/strings/str_util.h"
@@ -79,7 +81,7 @@ TEST(NodeDefUtilTest, In) {
)proto");
ExpectSuccess(node_def, op);
- EXPECT_EQ("n = In[T=DT_FLOAT](a)", SummarizeNodeDef(node_def));
+ EXPECT_EQ("{{node n}} = In[T=DT_FLOAT](a)", SummarizeNodeDef(node_def));
// Mismatching Op names.
NodeDef bad = node_def;
@@ -144,7 +146,7 @@ TEST(NodeDefUtilTest, Out) {
)proto");
ExpectSuccess(node_def, op);
- EXPECT_EQ("n = Out[T=DT_INT32]()", SummarizeNodeDef(node_def));
+ EXPECT_EQ("{{node n}} = Out[T=DT_INT32]()", SummarizeNodeDef(node_def));
// Non-number type.
NodeDef bad = node_def;
@@ -164,7 +166,7 @@ TEST(NodeDefUtilTest, Enum) {
)proto");
ExpectSuccess(node_def, op);
- EXPECT_EQ("n = Enum[e=\"apple\"]()", SummarizeNodeDef(node_def));
+ EXPECT_EQ("{{node n}} = Enum[e=\"apple\"]()", SummarizeNodeDef(node_def));
NodeDef good = node_def;
good.clear_attr();
@@ -191,7 +193,8 @@ TEST(NodeDefUtilTest, SameIn) {
)proto");
ExpectSuccess(node_def, op);
- EXPECT_EQ("n = SameIn[N=2, T=DT_DOUBLE](a, b)", SummarizeNodeDef(node_def));
+ EXPECT_EQ("{{node n}} = SameIn[N=2, T=DT_DOUBLE](a, b)",
+ SummarizeNodeDef(node_def));
// Illegal type
NodeDef bad = ToNodeDef(R"proto(
@@ -220,7 +223,7 @@ TEST(NodeDefUtilTest, AnyIn) {
)proto");
ExpectSuccess(node_def, op);
- EXPECT_EQ("n = AnyIn[T=[DT_INT32, DT_STRING]](a, b)",
+ EXPECT_EQ("{{node n}} = AnyIn[T=[DT_INT32, DT_STRING]](a, b)",
SummarizeNodeDef(node_def));
const NodeDef bad = ToNodeDef(R"proto(
@@ -243,13 +246,14 @@ TEST(NodeDefUtilTest, Device) {
const NodeDef node_def1 =
ToNodeDef(NodeDefBuilder("d", &op_def1).Device("/cpu:17"));
ExpectSuccess(node_def1, op_def1);
- EXPECT_EQ("d = None[_device=\"/cpu:17\"]()", SummarizeNodeDef(node_def1));
+ EXPECT_EQ("{{node d}} = None[_device=\"/cpu:17\"]()",
+ SummarizeNodeDef(node_def1));
const OpDef op_def2 = ToOpDef(OpDefBuilder("WithAttr").Attr("v: int"));
const NodeDef node_def2 =
ToNodeDef(NodeDefBuilder("d", &op_def2).Attr("v", 7).Device("/cpu:5"));
ExpectSuccess(node_def2, op_def2);
- EXPECT_EQ("d = WithAttr[v=7, _device=\"/cpu:5\"]()",
+ EXPECT_EQ("{{node d}} = WithAttr[v=7, _device=\"/cpu:5\"]()",
SummarizeNodeDef(node_def2));
}
@@ -284,7 +288,7 @@ TEST(NodeDefUtilTest, ValidSyntax) {
)proto");
ExpectValidSyntax(node_def_explicit_inputs);
- EXPECT_EQ("n = AnyIn[T=[DT_INT32, DT_STRING]](a:0, b:123)",
+ EXPECT_EQ("{{node n}} = AnyIn[T=[DT_INT32, DT_STRING]](a:0, b:123)",
SummarizeNodeDef(node_def_explicit_inputs));
const NodeDef node_def_partial_shape = ToNodeDef(R"proto(
@@ -379,7 +383,7 @@ TEST(NameRangesForNodeTest, Simple) {
EXPECT_EQ(NameRangeMap({{"a", {0, 1}}, {"b", {1, 2}}}), inputs);
EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 2}}}), outputs);
- EXPECT_EQ("simple = Simple[](a, b)", SummarizeNodeDef(node_def));
+ EXPECT_EQ("{{node simple}} = Simple[](a, b)", SummarizeNodeDef(node_def));
OpDef bad_op_def = op_def;
bad_op_def.mutable_input_arg(0)->clear_type();
@@ -399,7 +403,7 @@ TEST(NameRangesForNodeTest, Polymorphic) {
TF_EXPECT_OK(NameRangesForNode(node_def1, op_def, &inputs, &outputs));
EXPECT_EQ(NameRangeMap({{"a", {0, 1}}, {"b", {1, 2}}}), inputs);
EXPECT_EQ(NameRangeMap({{"c", {0, 1}}}), outputs);
- EXPECT_EQ("poly = Polymorphic[T=DT_INT32](a, b)",
+ EXPECT_EQ("{{node poly}} = Polymorphic[T=DT_INT32](a, b)",
SummarizeNodeDef(node_def1));
const NodeDef node_def2 = ToNodeDef(NodeDefBuilder("poly", &op_def)
@@ -408,7 +412,8 @@ TEST(NameRangesForNodeTest, Polymorphic) {
TF_EXPECT_OK(NameRangesForNode(node_def2, op_def, &inputs, &outputs));
EXPECT_EQ(NameRangeMap({{"a", {0, 1}}, {"b", {1, 2}}}), inputs);
EXPECT_EQ(NameRangeMap({{"c", {0, 1}}}), outputs);
- EXPECT_EQ("poly = Polymorphic[T=DT_BOOL](a, b)", SummarizeNodeDef(node_def2));
+ EXPECT_EQ("{{node poly}} = Polymorphic[T=DT_BOOL](a, b)",
+ SummarizeNodeDef(node_def2));
}
TEST(NameRangesForNodeTest, NRepeats) {
@@ -431,7 +436,8 @@ TEST(NameRangesForNodeTest, NRepeats) {
EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 5}}, {"e", {5, 8}}}),
outputs);
EXPECT_EQ(
- "nr = NRepeats[M=3, N=4, T=DT_FLOAT](a, a:1, a:2, a:3, b, b:1, b:2, b:3)",
+ "{{node nr}} = NRepeats[M=3, N=4, T=DT_FLOAT](a, a:1, a:2, a:3, b, b:1, "
+ "b:2, b:3)",
SummarizeNodeDef(node_def1));
const NodeDef node_def2 = ToNodeDef(NodeDefBuilder("nr", &op_def)
@@ -442,7 +448,7 @@ TEST(NameRangesForNodeTest, NRepeats) {
EXPECT_EQ(NameRangeMap({{"a", {0, 2}}, {"b", {2, 4}}}), inputs);
EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 3}}, {"e", {3, 10}}}),
outputs);
- EXPECT_EQ("nr = NRepeats[M=7, N=2, T=DT_DOUBLE](a, a:1, b, b:1)",
+ EXPECT_EQ("{{node nr}} = NRepeats[M=7, N=2, T=DT_DOUBLE](a, a:1, b, b:1)",
SummarizeNodeDef(node_def2));
NodeDef bad_node_def = node_def2;
@@ -471,7 +477,7 @@ TEST(NameRangesForNodeTest, TypeList) {
EXPECT_EQ(NameRangeMap({{"c", {0, 4}}, {"d", {4, 7}}, {"e", {7, 9}}}),
outputs);
EXPECT_EQ(
- "tl = TypeList[T1=[DT_BOOL, DT_FLOAT],"
+ "{{node tl}} = TypeList[T1=[DT_BOOL, DT_FLOAT],"
" T2=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT],"
" T3=[DT_INT32, DT_DOUBLE, DT_STRING]](a, a:1, b, b:1, b:2, b:3)",
SummarizeNodeDef(node_def1));
@@ -485,7 +491,8 @@ TEST(NameRangesForNodeTest, TypeList) {
EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 3}}, {"e", {3, 10}}}),
outputs);
EXPECT_EQ(
- "tl = TypeList[T1=[DT_INT32, DT_INT32, DT_INT32, DT_INT32, DT_INT32,"
+ "{{node tl}} = TypeList[T1=[DT_INT32, DT_INT32, DT_INT32, DT_INT32, "
+ "DT_INT32,"
" DT_INT32, DT_INT32], T2=[DT_DOUBLE], T3=[DT_DOUBLE, DT_STRING]]"
"(a, a:1, a:2, a:3, a:4, a:5, a:6, b)",
SummarizeNodeDef(node_def2));
@@ -509,5 +516,20 @@ TEST(AddPrefixAndSuffixToNode, Enter) {
EXPECT_EQ("prefix/test_frame/suffix", frame_name);
}
+TEST(FormatNodeForErrorTest, Node) {
+ Graph g(OpRegistry::Global());
+ Node* node;
+ TF_CHECK_OK(NodeBuilder("enter", "NoOp").Finalize(&g, &node));
+ EXPECT_EQ("{{node enter}}", FormatNodeForError(*node));
+}
+
+TEST(FormatNodeForErrorTest, NodeDef) {
+ NodeDef node_def;
+ node_def.set_name("enter");
+ node_def.set_op("Enter");
+ AddNodeAttr("frame_name", "test_frame", &node_def);
+ EXPECT_EQ("{{node enter}}", FormatNodeDefForError(node_def));
+}
+
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/core/framework/op_compatibility_test.cc b/tensorflow/core/framework/op_compatibility_test.cc
index c782480f1f..140f201085 100644
--- a/tensorflow/core/framework/op_compatibility_test.cc
+++ b/tensorflow/core/framework/op_compatibility_test.cc
@@ -209,8 +209,8 @@ TEST_F(OpCompatibilityTest, Same) {
.Finalize(node_def()));
ExpectSuccess(*RegisteredOpDef());
EXPECT_EQ(
- "same = Same[N=3, T=DT_FLOAT, TList=[DT_BOOL, DT_BOOL]](a, b, c, c:1, "
- "c:2, d, d:1, d:2, e, e:1)",
+ "{{node same}} = Same[N=3, T=DT_FLOAT, TList=[DT_BOOL, DT_BOOL]](a, b, "
+ "c, c:1, c:2, d, d:1, d:2, e, e:1)",
Result());
}
@@ -224,7 +224,7 @@ TEST_F(OpCompatibilityTest, AddAttr) {
OpDefBuilder("AddAttr").Output("ndef: string").Finalize(&old_op));
TF_ASSERT_OK(NodeDefBuilder("add_attr", &old_op.op_def).Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("add_attr = AddAttr[a=42]()", Result());
+ EXPECT_EQ("{{node add_attr}} = AddAttr[a=42]()", Result());
}
// Should be able to make an attr restriction less strict.
@@ -241,7 +241,7 @@ TEST_F(OpCompatibilityTest, LessStrict) {
.Attr("a", "B")
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("less_strict = LessStrict[a=\"B\"]()", Result());
+ EXPECT_EQ("{{node less_strict}} = LessStrict[a=\"B\"]()", Result());
}
// Should be able to remove an attr restriction.
@@ -259,7 +259,8 @@ TEST_F(OpCompatibilityTest, RemoveRestriction) {
.Attr("a", DT_INT32)
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("remove_restriction = RemoveRestriction[a=DT_INT32]()", Result());
+ EXPECT_EQ("{{node remove_restriction}} = RemoveRestriction[a=DT_INT32]()",
+ Result());
}
// Should be able to change the order of attrs.
@@ -278,7 +279,7 @@ TEST_F(OpCompatibilityTest, AttrOrder) {
.Attr("a", 7)
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("attr_order = AttrOrder[a=7, b=true]()", Result());
+ EXPECT_EQ("{{node attr_order}} = AttrOrder[a=7, b=true]()", Result());
}
// Should be able to make an input/output polymorphic.
@@ -299,7 +300,8 @@ TEST_F(OpCompatibilityTest, TypePolymorphic) {
.Input(FakeInput())
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("type_polymorphic = TypePolymorphic[T=DT_INT32](a)", Result());
+ EXPECT_EQ("{{node type_polymorphic}} = TypePolymorphic[T=DT_INT32](a)",
+ Result());
}
// Should be able to make a single input/output into a list.
@@ -320,7 +322,7 @@ TEST_F(OpCompatibilityTest, MakeList) {
.Input(FakeInput())
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("make_list = MakeList[N=1](a)", Result());
+ EXPECT_EQ("{{node make_list}} = MakeList[N=1](a)", Result());
}
// Should be able to make a single input/output into a polymorphic list.
@@ -343,7 +345,8 @@ TEST_F(OpCompatibilityTest, MakePolyList) {
.Input(FakeInput())
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("make_poly_list = MakePolyList[N=1, T=DT_INT32](a)", Result());
+ EXPECT_EQ("{{node make_poly_list}} = MakePolyList[N=1, T=DT_INT32](a)",
+ Result());
}
// Should be able to make a single input/output into an arbitrary list.
@@ -364,7 +367,7 @@ TEST_F(OpCompatibilityTest, MakeAnyList) {
.Input(FakeInput())
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("make_any_list = MakeAnyList[T=[DT_INT32]](a)", Result());
+ EXPECT_EQ("{{node make_any_list}} = MakeAnyList[T=[DT_INT32]](a)", Result());
}
// Should be able to make a single polymorphic input/output into a list of
@@ -387,7 +390,8 @@ TEST_F(OpCompatibilityTest, PolyIntoList) {
.Input(FakeInput(DT_INT32))
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("poly_into_list = PolyIntoList[N=1, T=DT_INT32](a)", Result());
+ EXPECT_EQ("{{node poly_into_list}} = PolyIntoList[N=1, T=DT_INT32](a)",
+ Result());
}
// Should be able to make a multiple inputs/outputs into a list with
@@ -413,7 +417,7 @@ TEST_F(OpCompatibilityTest, MakeMultipleSameList) {
.Input(FakeInput())
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("make_list = MakeMultipleSameList[N=2](a, b)", Result());
+ EXPECT_EQ("{{node make_list}} = MakeMultipleSameList[N=2](a, b)", Result());
}
// Changing from int32, float -> T
@@ -437,8 +441,9 @@ TEST_F(OpCompatibilityTest, MakeMultipleAnyList) {
.Input(FakeInput())
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("make_list = MakeMultipleAnyList[T=[DT_INT32, DT_FLOAT]](a, b)",
- Result());
+ EXPECT_EQ(
+ "{{node make_list}} = MakeMultipleAnyList[T=[DT_INT32, DT_FLOAT]](a, b)",
+ Result());
}
// Should be able to change the name of an input/output.
@@ -455,7 +460,7 @@ TEST_F(OpCompatibilityTest, ChangeName) {
.Input(FakeInput())
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("change_name = ChangeName[](a)", Result());
+ EXPECT_EQ("{{node change_name}} = ChangeName[](a)", Result());
}
// Should be able to add an input/output of type
@@ -473,7 +478,7 @@ TEST_F(OpCompatibilityTest, AddNInts) {
TF_ASSERT_OK(
NodeDefBuilder("add_n_ints", &old_op.op_def).Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("add_n_ints = AddNInts[N=0]()", Result());
+ EXPECT_EQ("{{node add_n_ints}} = AddNInts[N=0]()", Result());
}
// Should be able to add an input/output of type N * T
@@ -492,7 +497,7 @@ TEST_F(OpCompatibilityTest, AddNSame) {
TF_ASSERT_OK(
NodeDefBuilder("add_n_same", &old_op.op_def).Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("add_n_same = AddNSame[N=0, T=DT_BOOL]()", Result());
+ EXPECT_EQ("{{node add_n_same}} = AddNSame[N=0, T=DT_BOOL]()", Result());
}
// Should be able to add an input/output of type N * T
@@ -517,8 +522,10 @@ TEST_F(OpCompatibilityTest, AddNSameAsExisting) {
.Input(FakeInput(DT_STRING))
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("add_n_same_as_existing = AddNSameAsExisting[N=0, T=DT_STRING](a)",
- Result());
+ EXPECT_EQ(
+ "{{node add_n_same_as_existing}} = AddNSameAsExisting[N=0, "
+ "T=DT_STRING](a)",
+ Result());
}
// Should be able to add an input/output of type T
@@ -536,7 +543,7 @@ TEST_F(OpCompatibilityTest, AddAnyList) {
TF_ASSERT_OK(
NodeDefBuilder("add_any_list", &old_op.op_def).Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("add_any_list = AddAnyList[T=[]]()", Result());
+ EXPECT_EQ("{{node add_any_list}} = AddAnyList[T=[]]()", Result());
}
// Should be able to allow shorter lists.
@@ -557,8 +564,10 @@ TEST_F(OpCompatibilityTest, ShorterAnyList) {
.Input(FakeInput(2, DT_BOOL))
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("shorter_any_list = ShorterAnyList[T=[DT_BOOL, DT_BOOL]](a, a:1)",
- Result());
+ EXPECT_EQ(
+ "{{node shorter_any_list}} = ShorterAnyList[T=[DT_BOOL, DT_BOOL]](a, "
+ "a:1)",
+ Result());
}
REGISTER_OP("ShorterSameList")
@@ -578,7 +587,8 @@ TEST_F(OpCompatibilityTest, ShorterSameList) {
.Input(FakeInput(2))
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("shorter_same_list = ShorterSameList[N=2](a, a:1)", Result());
+ EXPECT_EQ("{{node shorter_same_list}} = ShorterSameList[N=2](a, a:1)",
+ Result());
}
// Can remove a restriction to an attr
@@ -597,7 +607,7 @@ TEST_F(OpCompatibilityTest, AttrRemoveRestriction) {
.Attr("t", DT_INT32)
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("remove_restriction = AttrRemoveRestriction[t=DT_INT32]()",
+ EXPECT_EQ("{{node remove_restriction}} = AttrRemoveRestriction[t=DT_INT32]()",
Result());
}
@@ -619,7 +629,8 @@ TEST_F(OpCompatibilityTest, AttrLessRestrictive) {
.Attr("t", DT_INT32)
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("less_restrictive = AttrLessRestrictive[t=DT_INT32]()", Result());
+ EXPECT_EQ("{{node less_restrictive}} = AttrLessRestrictive[t=DT_INT32]()",
+ Result());
}
// Can remove a minimum from an attr.
@@ -637,7 +648,7 @@ TEST_F(OpCompatibilityTest, AttrRemoveMin) {
.Attr("n", 4)
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("remove_min = AttrRemoveMin[n=4]()", Result());
+ EXPECT_EQ("{{node remove_min}} = AttrRemoveMin[n=4]()", Result());
}
// Can lower the minimum on an attr.
@@ -655,7 +666,7 @@ TEST_F(OpCompatibilityTest, AttrLowerMin) {
.Attr("n", 4)
.Finalize(node_def()));
ExpectSuccess(old_op.op_def);
- EXPECT_EQ("lower_min = AttrLowerMin[n=4]()", Result());
+ EXPECT_EQ("{{node lower_min}} = AttrLowerMin[n=4]()", Result());
}
// Can make a ref input into a non-ref input.
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 58feec90f0..b53bd8d53d 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -1061,40 +1061,51 @@ Status SupportedDeviceTypesForNode(
}
void LogAllRegisteredKernels() {
- for (const auto& key_registration : *GlobalKernelRegistryTyped()) {
- const KernelDef& kernel_def(key_registration.second.def);
+ KernelList kernel_list = GetAllRegisteredKernels();
+ for (const auto& kernel_def : kernel_list.kernel()) {
LOG(INFO) << "OpKernel ('" << ProtoShortDebugString(kernel_def) << "')";
}
}
KernelList GetAllRegisteredKernels() {
+ return GetFilteredRegisteredKernels([](const KernelDef& k) { return true; });
+}
+
+KernelList GetFilteredRegisteredKernels(
+ const std::function<bool(const KernelDef&)>& predicate) {
const KernelRegistry* const typed_registry = GlobalKernelRegistryTyped();
KernelList kernel_list;
kernel_list.mutable_kernel()->Reserve(typed_registry->size());
for (const auto& p : *typed_registry) {
- *kernel_list.add_kernel() = p.second.def;
+ const KernelDef& kernel_def = p.second.def;
+ if (predicate(kernel_def)) {
+ *kernel_list.add_kernel() = kernel_def;
+ }
}
return kernel_list;
}
+KernelList GetRegisteredKernelsForOp(StringPiece op_name) {
+ auto op_pred = [op_name](const KernelDef& k) { return k.op() == op_name; };
+ return GetFilteredRegisteredKernels(op_pred);
+}
+
string KernelsRegisteredForOp(StringPiece op_name) {
+ KernelList kernel_list = GetRegisteredKernelsForOp(op_name);
+ if (kernel_list.kernel_size() == 0) return " <no registered kernels>\n";
string ret;
- for (const auto& key_registration : *GlobalKernelRegistryTyped()) {
- const KernelDef& kernel_def(key_registration.second.def);
- if (kernel_def.op() == op_name) {
- strings::StrAppend(&ret, " device='", kernel_def.device_type(), "'");
- if (!kernel_def.label().empty()) {
- strings::StrAppend(&ret, "; label='", kernel_def.label(), "'");
- }
- for (int i = 0; i < kernel_def.constraint_size(); ++i) {
- strings::StrAppend(
- &ret, "; ", kernel_def.constraint(i).name(), " in ",
- SummarizeAttrValue(kernel_def.constraint(i).allowed_values()));
- }
- strings::StrAppend(&ret, "\n");
+ for (const auto& kernel_def : kernel_list.kernel()) {
+ strings::StrAppend(&ret, " device='", kernel_def.device_type(), "'");
+ if (!kernel_def.label().empty()) {
+ strings::StrAppend(&ret, "; label='", kernel_def.label(), "'");
}
+ for (int i = 0; i < kernel_def.constraint_size(); ++i) {
+ strings::StrAppend(
+ &ret, "; ", kernel_def.constraint(i).name(), " in ",
+ SummarizeAttrValue(kernel_def.constraint(i).allowed_values()));
+ }
+ strings::StrAppend(&ret, "\n");
}
- if (ret.empty()) return " <no registered kernels>\n";
return ret;
}
@@ -1277,4 +1288,10 @@ void OpKernelContext::CtxFailureWithWarning(const char* file, int line,
SetStatus(s);
}
+void CheckNotInComputeAsync(OpKernelContext* ctx,
+ const char* correct_macro_name) {
+ CHECK_EQ(nullptr, ctx->op_kernel().AsAsync())
+ << "Use " << correct_macro_name << " in AsyncOpKernel implementations.";
+}
+
} // namespace tensorflow
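A hedged usage sketch of the refactored registry queries above; the helper names match the patch, while DumpCpuKernels and the logging around it are illustrative assumptions:

    // List CPU kernels via the new predicate-based query, then reuse the
    // per-op convenience wrapper built on top of the same filtered path.
    void DumpCpuKernels() {
      KernelList cpu_kernels = GetFilteredRegisteredKernels(
          [](const KernelDef& k) { return k.device_type() == "CPU"; });
      for (const auto& def : cpu_kernels.kernel()) {
        LOG(INFO) << def.op() << " registered on " << def.device_type();
      }
      KernelList matmul_kernels = GetRegisteredKernelsForOp("MatMul");
      LOG(INFO) << "MatMul kernel count: " << matmul_kernels.kernel_size();
    }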
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index d9fe42fcbb..2b7cc867da 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -113,6 +113,7 @@ class OpKernel {
// Returns nullptr iff this op kernel is synchronous.
virtual AsyncOpKernel* AsAsync() { return nullptr; }
+ virtual const AsyncOpKernel* AsAsync() const { return nullptr; }
// Returns true iff this op kernel is considered "expensive". The
// runtime may use this flag to optimize graph execution for example
@@ -197,6 +198,7 @@ class AsyncOpKernel : public OpKernel {
virtual void ComputeAsync(OpKernelContext* context, DoneCallback done) = 0;
AsyncOpKernel* AsAsync() final { return this; }
+ const AsyncOpKernel* AsAsync() const final { return this; }
void Compute(OpKernelContext* context) final;
@@ -1304,6 +1306,13 @@ void LogAllRegisteredKernels();
// Gets a list of all registered kernels.
KernelList GetAllRegisteredKernels();
+// Gets a list of all registered kernels for which `predicate` returns true.
+KernelList GetFilteredRegisteredKernels(
+ const std::function<bool(const KernelDef&)>& predicate);
+
+// Gets a list of all registered kernels for a given op.
+KernelList GetRegisteredKernelsForOp(StringPiece op_name);
+
namespace kernel_factory {
class OpKernelRegistrar {
@@ -1535,21 +1544,36 @@ inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) {
// ...
// }
-#define OP_REQUIRES(CTX, EXP, STATUS) \
- do { \
- if (!TF_PREDICT_TRUE(EXP)) { \
- (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \
- return; \
- } \
+// Generate a fatal error if OP_REQUIRES or OP_REQUIRES_OK are used in
+// AsyncOpKernel implementations. If these macros are used and the condition
+// does not hold, the `done` callback will never be called and the system will
+// deadlock, so a crash failure is preferable. Since the OP_REQUIRES[_OK] macros
+// are legal to use in AsyncOpKernel constructors, we use overload resolution
+// to distinguish between OpKernelConstruction* and OpKernelContext* context
+// types.
+class XlaOpKernelContext;
+inline void CheckNotInComputeAsync(XlaOpKernelContext*, const char*) {}
+inline void CheckNotInComputeAsync(OpKernelConstruction*, const char*) {}
+void CheckNotInComputeAsync(OpKernelContext* ctx,
+ const char* correct_macro_name);
+
+#define OP_REQUIRES(CTX, EXP, STATUS) \
+ do { \
+ if (!TF_PREDICT_TRUE(EXP)) { \
+ CheckNotInComputeAsync((CTX), "OP_REQUIRES_ASYNC"); \
+ (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \
+ return; \
+ } \
} while (0)
-#define OP_REQUIRES_OK(CTX, ...) \
- do { \
- ::tensorflow::Status _s(__VA_ARGS__); \
- if (!TF_PREDICT_TRUE(_s.ok())) { \
- (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
- return; \
- } \
+#define OP_REQUIRES_OK(CTX, ...) \
+ do { \
+ ::tensorflow::Status _s(__VA_ARGS__); \
+ if (!TF_PREDICT_TRUE(_s.ok())) { \
+ CheckNotInComputeAsync((CTX), "OP_REQUIRES_OK_ASYNC"); \
+ (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
+ return; \
+ } \
} while (0)
#define OP_REQUIRES_ASYNC(CTX, EXP, STATUS, CALLBACK) \
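To make the deadlock described in the comment block concrete, here is a hedged sketch of the misuse the overload set now catches; MyAsyncOp and ValidateInput are assumptions, the macros are the ones defined above:

    class MyAsyncOp : public AsyncOpKernel {
     public:
      using AsyncOpKernel::AsyncOpKernel;
      void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
        // Wrong: plain OP_REQUIRES returns without invoking done(), so anyone
        // waiting on this kernel deadlocks; CheckNotInComputeAsync now CHECKs:
        //   OP_REQUIRES(ctx, ValidateInput(ctx), errors::InvalidArgument("x"));
        // Right: the _ASYNC variant invokes the callback before returning.
        OP_REQUIRES_ASYNC(ctx, ValidateInput(ctx),
                          errors::InvalidArgument("bad input"), done);
        done();
      }

     private:
      static bool ValidateInput(OpKernelContext*) { return true; }
    };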
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index b76a3400a8..83dda6579b 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -965,7 +965,8 @@ BENCHMARK(BM_ConcatInputRange);
BENCHMARK(BM_SelectInputRange);
TEST(RegisteredKernels, CanCallGetAllRegisteredKernels) {
- auto all_registered_kernels = GetAllRegisteredKernels().kernel();
+ auto kernel_list = GetAllRegisteredKernels();
+ auto all_registered_kernels = kernel_list.kernel();
auto has_name_test1 = [](const KernelDef& k) { return k.op() == "Test1"; };
// Verify we can find the "Test1" op registered above
@@ -986,5 +987,20 @@ TEST(RegisteredKernels, CanLogAllRegisteredKernels) {
tensorflow::LogAllRegisteredKernels();
}
+TEST(RegisteredKernels, GetFilteredRegisteredKernels) {
+ auto has_name_test1 = [](const KernelDef& k) { return k.op() == "Test1"; };
+ auto kernel_list = GetFilteredRegisteredKernels(has_name_test1);
+ ASSERT_EQ(kernel_list.kernel_size(), 1);
+ EXPECT_EQ(kernel_list.kernel(0).op(), "Test1");
+ EXPECT_EQ(kernel_list.kernel(0).device_type(), "CPU");
+}
+
+TEST(RegisteredKernels, GetRegisteredKernelsForOp) {
+ auto kernel_list = GetRegisteredKernelsForOp("Test1");
+ ASSERT_EQ(kernel_list.kernel_size(), 1);
+ EXPECT_EQ(kernel_list.kernel(0).op(), "Test1");
+ EXPECT_EQ(kernel_list.kernel(0).device_type(), "CPU");
+}
+
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index d2f2609d3b..1b19ab5da3 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -482,6 +482,7 @@ class Tensor {
friend class VariableOp; // For access to set_shape
friend class AutoReloadVariableOp; // For access to set_shape
friend class TensorTestHelper; // For access to set_shape
+ friend class CastOpBase; // For access to set_dtype;
friend class OpKernelContext; // For access to RefCountIsOne().
friend class ScopedAllocator; // For access to buf_.
friend class XlaTensor; // For access to RefCountIsOne().
diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc
index 1778e48ef6..8e1e56d29b 100644
--- a/tensorflow/core/graph/control_flow.cc
+++ b/tensorflow/core/graph/control_flow.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include <deque>
#include <vector>
+#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/lib/core/errors.h"
@@ -54,10 +55,11 @@ Status ValidateControlFlowInfo(const Graph* graph,
frame.parent = parent;
frame.name = cf.frame_name;
} else if (frame.parent != parent) {
- return errors::InvalidArgument(
+ return errors::Internal(
"Invalid loop structure: Mismatched parent frames for \"",
cf.frame_name, "\": \"", parent->name, "\" vs \"", frame.parent->name,
- "\". This is an internal bug, please file a bug report with "
+ "\". The node giving this error: ", FormatNodeForError(*node),
+ "This is an internal bug, please file a bug report with "
"instructions on how to reproduce the error.");
}
if (IsLoopCond(node)) {
@@ -69,9 +71,9 @@ Status ValidateControlFlowInfo(const Graph* graph,
!str_util::StrContains(node->name(), "LoopCounter")) {
return errors::InvalidArgument(
"Invalid loop structure: Loop \"", cf.frame_name,
- "\" has more than one LoopCond node: \"", node->name(), "\" and \"",
- frame.loop_cond->name(),
- "\". This is an internal bug, please file a bug report with "
+ "\" has more than one LoopCond node: ", FormatNodeForError(*node),
+ " and ", FormatNodeForError(*frame.loop_cond),
+ ". This is an internal bug, please file a bug report with "
"instructions on how to reproduce the error.");
}
frame.loop_cond = node;
@@ -135,12 +137,11 @@ Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info,
const string& parent_frame = (*info)[out_parent->id()].frame_name;
if (parent_frame != frame_name) {
return errors::InvalidArgument(
- "The node '", out->name(),
- "' has inputs from different "
- "frames. The input '",
- curr_node->name(), "' is in frame '", frame_name,
- "'. The input '", parent_nodes[out->id()]->name(),
- "' is in frame '", parent_frame, "'.");
+ FormatNodeForError(*out),
+ " has inputs from different frames. The input ",
+ FormatNodeForError(*curr_node), " is in frame '", frame_name,
+ "'. The input ", FormatNodeForError(*parent_nodes[out->id()]),
+ " is in frame '", parent_frame, "'.");
}
} else {
out_info->frame = out;
@@ -148,7 +149,8 @@ Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info,
TF_RETURN_IF_ERROR(
GetNodeAttr(out->attrs(), "frame_name", &out_info->frame_name));
if (out_info->frame_name.empty()) {
- return errors::InvalidArgument("The Enter node ", out->name(),
+ return errors::InvalidArgument("The Enter ",
+ FormatNodeForError(*out),
" must have a frame name.");
}
}
@@ -156,12 +158,11 @@ Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info,
if (is_visited) {
if (out_info->frame_name != frame_name) {
return errors::InvalidArgument(
- "The node '", out->name(),
- "' has inputs from different "
- "frames. The input '",
- curr_node->name(), "' is in frame '", frame_name,
- "'. The input '", parent_nodes[out->id()]->name(),
- "' is in frame '", out_info->frame_name, "'.");
+ FormatNodeForError(*out),
+ " has inputs from different frames. The input ",
+ FormatNodeForError(*curr_node), " is in frame '", frame_name,
+ "'. The input ", FormatNodeForError(*parent_nodes[out->id()]),
+ " is in frame '", out_info->frame_name, "'.");
}
} else {
out_info->frame = frame;
diff --git a/tensorflow/core/graph/control_flow_test.cc b/tensorflow/core/graph/control_flow_test.cc
index eb7937400f..803c757c3f 100644
--- a/tensorflow/core/graph/control_flow_test.cc
+++ b/tensorflow/core/graph/control_flow_test.cc
@@ -63,6 +63,15 @@ TEST(ValidateControlFlowTest, InputsFromDifferentFrames) {
EXPECT_TRUE(str_util::StrContains(status.error_message(),
"has inputs from different frames"))
<< status.error_message();
+ EXPECT_TRUE(str_util::StrContains(status.error_message(),
+ "{{node outer/body/inner/Merge}}"))
+ << status.error_message();
+ EXPECT_TRUE(str_util::StrContains(status.error_message(),
+ "{{node outer/body/inner/Enter}}"))
+ << status.error_message();
+ EXPECT_TRUE(
+ str_util::StrContains(status.error_message(), "{{node outer/Switch}}"))
+ << status.error_message();
}
TEST(ValidateControlFlowTest, MismatchedParentFrames) {
@@ -102,6 +111,8 @@ TEST(ValidateControlFlowTest, MismatchedParentFrames) {
EXPECT_TRUE(
str_util::StrContains(status.error_message(), "Mismatched parent frames"))
<< status.error_message();
+ EXPECT_TRUE(str_util::StrContains(status.error_message(), "{{node Enter2}}"))
+ << status.error_message();
}
TEST(ValidateControlFlowTest, TwoLoopCond) {
@@ -125,6 +136,12 @@ TEST(ValidateControlFlowTest, TwoLoopCond) {
EXPECT_TRUE(str_util::StrContains(status.error_message(),
"more than one LoopCond node"))
<< status.error_message();
+ EXPECT_TRUE(
+ str_util::StrContains(status.error_message(), "{{node sub/LoopCond}}"))
+ << status.error_message();
+ EXPECT_TRUE(
+ str_util::StrContains(status.error_message(), "{{node LoopCond}}"))
+ << status.error_message();
}
} // namespace
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 1b1941f9c1..ea0a814ab8 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -214,6 +214,14 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
cast_builder.Attr("_start_time", start_time);
}
cast_builder.Attr("DstT", cast_dtype);
+
+ if (cast_dtype == DT_BFLOAT16) {
+    // The attribute below specifies that the cast to bfloat16 should use
+    // truncation. This is needed to retain legacy behavior when we change
+    // the default bfloat16 casts to use rounding instead of truncation.
+ cast_builder.Attr("Truncate", true);
+ }
+
NodeDef* cast = gdef->add_node();
*status = cast_builder.Finalize(cast);
if (!status->ok()) return nullptr;
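For reference, a standalone sketch of what the Truncate attribute means for a float32-to-bfloat16 cast, assuming the usual layout where bfloat16 is the high half of the IEEE-754 float32 bit pattern:

    #include <cstdint>
    #include <cstring>

    // Truncation keeps the top 16 bits and drops the low mantissa bits,
    // instead of rounding to nearest even.
    uint16_t Float32ToBfloat16Truncate(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));  // safe type pun
      return static_cast<uint16_t>(bits >> 16);
    }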
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 5f51d6083b..333bf761b0 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -17,7 +17,6 @@ limitations under the License.
#define TENSORFLOW_CORE_GRAPH_MKL_GRAPH_UTIL_H_
#ifdef INTEL_MKL
-#include <string>
#include "tensorflow/core/framework/op_kernel.h"
namespace tensorflow {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 836ccae9b7..c22e0a3872 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -22,7 +22,6 @@ limitations under the License.
#include <memory>
#include <queue>
#include <set>
-#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
@@ -2894,7 +2893,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
for (const Edge* e : n->in_edges()) {
// Rewrite only if there is a corresponding LRN, i.e. workspace is available
if (e->dst()->type_string() == csinfo_.lrn_grad && e->dst_input() == 2 &&
- e->src()->type_string() == mkl_op_registry::GetMklOpName(csinfo_.lrn) &&
+ e->src()->type_string() ==
+ mkl_op_registry::GetMklOpName(csinfo_.lrn) &&
e->src_output() == 0) {
do_rewrite = true;
break;
@@ -2907,9 +2907,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
CHECK_NOTNULL(n);
bool do_rewrite = false;
for (const Edge* e : n->in_edges()) {
- // Rewrite only if there is corresponding Maxpool, i.e workspace is available
- if (e->dst()->type_string() == csinfo_.max_pool_grad && e->dst_input() == 1 &&
- e->src()->type_string() == mkl_op_registry::GetMklOpName(csinfo_.max_pool) &&
+      // Rewrite only if there is a corresponding MaxPool, i.e. workspace is
+      // available.
+ if (e->dst()->type_string() == csinfo_.max_pool_grad &&
+ e->dst_input() == 1 &&
+ e->src()->type_string() ==
+ mkl_op_registry::GetMklOpName(csinfo_.max_pool) &&
e->src_output() == 0) {
do_rewrite = true;
break;
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index b8f5492f7c..a41f5861af 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
#include "tensorflow/core/graph/mkl_graph_util.h"
#include <algorithm>
-#include <string>
#include <vector>
#include "tensorflow/core/framework/op.h"
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index e9ced4d2b6..aa39af637f 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -18,7 +18,6 @@ limitations under the License.
#include <memory>
#include <queue>
#include <set>
-#include <string>
#include <utility>
#include <vector>
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
index bbdbe78bbd..ebcb6de551 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
#include "tensorflow/core/graph/mkl_graph_util.h"
#include <algorithm>
-#include <string>
#include <vector>
#include "tensorflow/core/framework/op.h"
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 9dcc6765f5..7c6fe56e1f 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -33,6 +33,7 @@ tf_cc_test(
name = "utils_test",
srcs = ["utils_test.cc"],
deps = [
+ ":grappler_item",
":utils",
"//tensorflow/cc:cc_ops",
"//tensorflow/core:all_kernels",
@@ -151,3 +152,32 @@ tf_cc_test(
"//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
],
)
+
+cc_library(
+ name = "mutable_graph_view",
+ srcs = [
+ "mutable_graph_view.cc",
+ ],
+ hdrs = ["mutable_graph_view.h"],
+ visibility = ["//visibility:public"],
+ deps = [
+ ":graph_view",
+ ":grappler_item",
+ ":utils",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:protos_all_cc",
+ ],
+)
+
+tf_cc_test(
+ name = "mutable_graph_view_test",
+ srcs = ["mutable_graph_view_test.cc"],
+ deps = [
+ ":grappler_item",
+ ":mutable_graph_view",
+ "//tensorflow/cc:cc_ops",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+ ],
+)
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 83a8326e79..231c7c63be 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -496,18 +496,11 @@ class SymbolicShapeRefiner {
"supported.");
}
+ // It is guaranteed that output_tensors does not contain any control
+ // inputs, so port_id >= 0.
string out_tensor = out_arg.output_tensors[0];
- auto out_tensor_pieces = str_util::Split(out_tensor, ",");
- string node_name = out_tensor_pieces[0];
int port_id;
-
- // Check if port_id was included in out_tensor
- if (out_tensor_pieces.size() <= 1) {
- port_id = 0;
- } else if (!strings::safe_strto32(out_tensor_pieces[1], &port_id)) {
- return errors::FailedPrecondition(
- "Failed string to integer conversion for ", out_tensor_pieces[1]);
- }
+ string node_name = ParseNodeName(out_tensor, &port_id);
const NodeDef* retnode = gv.GetNode(node_name);
if (retnode == nullptr) {
@@ -516,6 +509,11 @@ class SymbolicShapeRefiner {
}
auto output_properties = gp.GetOutputProperties(retnode->name());
+ if (port_id >= output_properties.size()) {
+ return errors::InvalidArgument(
+ out_tensor, " has invalid position ", port_id,
+ " (output_properties.size() = ", output_properties.size(), ").");
+ }
auto const& outprop = output_properties[port_id];
const TensorShapeProto& shape = outprop.shape();
ShapeHandle out;
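The rewrite above leans on the ParseNodeName contract from grappler/utils.h; a hedged sketch of the cases it distinguishes:

    int port_id;
    string name = ParseNodeName("foo:2", &port_id);  // name == "foo", port_id == 2
    ParseNodeName("foo", &port_id);                  // implicit output, port_id == 0
    ParseNodeName("^foo", &port_id);                 // control input, port_id == -1
    // Function output_tensors never name control inputs, hence port_id >= 0.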
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 1be19d291a..5acfb56b05 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -887,6 +887,44 @@ TEST_F(GraphPropertiesTest, LargeFunctionStaticShapeInference) {
EXPECT_EQ(8, in_prop3.shape().dim(3).size());
}
+TEST_F(GraphPropertiesTest, LargeFunctionWithMultipleOutputs) {
+  // Test graph produced in Python using:
+ /*
+ @function.Defun(noinline=True)
+ def MyFunc():
+ @function.Defun(*[tf.float32] * 2)
+ def Cond(n, unused_x):
+ return n > 0
+
+ @function.Defun(*[tf.float32] * 2)
+ def Body(n, x):
+ return n - 1, x + n
+
+ i = tf.constant(10)
+ return functional_ops.While([i, 0.], Cond, Body)
+
+ with tf.Graph().as_default():
+ z = MyFunc()
+ */
+ GrapplerItem item;
+ string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
+ "function_functional_while.pbtxt");
+ TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
+ GraphProperties properties(item);
+ TF_CHECK_OK(properties.InferStatically(false));
+
+ const auto out_props = properties.GetOutputProperties("MyFunc_AenMyWWx1Us");
+ EXPECT_EQ(2, out_props.size());
+
+ const OpInfo::TensorProperties& out_prop0 = out_props[0];
+ EXPECT_EQ(DT_INT32, out_prop0.dtype());
+ EXPECT_FALSE(out_prop0.shape().unknown_rank());
+
+ const OpInfo::TensorProperties& out_prop1 = out_props[1];
+ EXPECT_EQ(DT_FLOAT, out_prop1.dtype());
+ EXPECT_FALSE(out_prop1.shape().unknown_rank());
+}
+
TEST_F(GraphPropertiesTest, FunctionWithErrorStaticShapeInference) {
GrapplerItem item;
string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt
new file mode 100644
index 0000000000..c94ee2f227
--- /dev/null
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt
@@ -0,0 +1,239 @@
+node {
+ name: "MyFunc_AenMyWWx1Us"
+ op: "MyFunc_AenMyWWx1Us"
+}
+library {
+ function {
+ signature {
+ name: "MyFunc_AenMyWWx1Us"
+ output_arg {
+ name: "while"
+ type: DT_INT32
+ }
+ output_arg {
+ name: "while_0"
+ type: DT_FLOAT
+ }
+ is_stateful: true
+ }
+ node_def {
+ name: "Const"
+ op: "Const"
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: 10
+ }
+ }
+ }
+ }
+ node_def {
+ name: "While/input_1"
+ op: "Const"
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 0.0
+ }
+ }
+ }
+ }
+ node_def {
+ name: "While"
+ op: "While"
+ input: "Const:output:0"
+ input: "While/input_1:output:0"
+ attr {
+ key: "T"
+ value {
+ list {
+ type: DT_INT32
+ type: DT_FLOAT
+ }
+ }
+ }
+ attr {
+ key: "body"
+ value {
+ func {
+ name: "Body_8GOMGeZeK5c"
+ }
+ }
+ }
+ attr {
+ key: "cond"
+ value {
+ func {
+ name: "Cond_Xf5ttAHgUCg"
+ }
+ }
+ }
+ }
+ ret {
+ key: "while"
+ value: "While:output:0"
+ }
+ ret {
+ key: "while_0"
+ value: "While:output:1"
+ }
+ attr {
+ key: "_noinline"
+ value {
+ b: true
+ }
+ }
+ }
+ function {
+ signature {
+ name: "Body_8GOMGeZeK5c"
+ input_arg {
+ name: "n"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "x"
+ type: DT_FLOAT
+ }
+ output_arg {
+ name: "sub"
+ type: DT_FLOAT
+ }
+ output_arg {
+ name: "add"
+ type: DT_FLOAT
+ }
+ }
+ node_def {
+ name: "sub/y"
+ op: "Const"
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 1.0
+ }
+ }
+ }
+ }
+ node_def {
+ name: "sub_0"
+ op: "Sub"
+ input: "n"
+ input: "sub/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ }
+ node_def {
+ name: "add_0"
+ op: "Add"
+ input: "x"
+ input: "n"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ }
+ ret {
+ key: "add"
+ value: "add_0:z:0"
+ }
+ ret {
+ key: "sub"
+ value: "sub_0:z:0"
+ }
+ }
+ function {
+ signature {
+ name: "Cond_Xf5ttAHgUCg"
+ input_arg {
+ name: "n"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "unused_x"
+ type: DT_FLOAT
+ }
+ output_arg {
+ name: "greater"
+ type: DT_BOOL
+ }
+ }
+ node_def {
+ name: "Greater/y"
+ op: "Const"
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 0.0
+ }
+ }
+ }
+ }
+ node_def {
+ name: "Greater"
+ op: "Greater"
+ input: "n"
+ input: "Greater/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ }
+ ret {
+ key: "greater"
+ value: "Greater:z:0"
+ }
+ }
+}
+versions {
+ producer: 26
+ min_consumer: 12
+}
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index d34eecd009..5b303f6ccb 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -65,6 +65,7 @@ constexpr char kAvgPool[] = "AvgPool";
constexpr char kAvgPoolGrad[] = "AvgPoolGrad";
constexpr char kFusedBatchNorm[] = "FusedBatchNorm";
constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
+constexpr char kQuantizedMatMulV2[] = "QuantizedMatMulV2";
static const Costs::Duration kMinComputeTime(1);
@@ -226,6 +227,7 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
{kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
{kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
{kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+ {kQuantizedMatMulV2, wrap(&OpLevelCostEstimator::PredictMatMul)},
{kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
{kGuaranteeConst, wrap(&OpLevelCostEstimator::PredictNoOp)},
@@ -268,67 +270,70 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
EIGEN_COST(scalar_product_op<float>) + EIGEN_COST(scalar_max_op<float>) +
EIGEN_COST(scalar_min_op<float>) + EIGEN_COST(scalar_round_op<float>);
- elementwise_ops_ = {// Unary ops alphabetically sorted
- {"Acos", EIGEN_COST(scalar_acos_op<float>)},
- {"Asin", EIGEN_COST(scalar_asin_op<float>)},
- {"Atan", EIGEN_COST(scalar_atan_op<float>)},
- {"Atan2", EIGEN_COST(scalar_quotient_op<float>) +
- EIGEN_COST(scalar_atan_op<float>)},
- {"Ceil", EIGEN_COST(scalar_ceil_op<float>)},
- {"Cos", EIGEN_COST(scalar_cos_op<float>)},
- {"Dequantize", EIGEN_COST(scalar_product_op<float>)},
- {"Erf", 1},
- {"Erfc", 1},
- {"Exp", EIGEN_COST(scalar_exp_op<float>)},
- {"Expm1", EIGEN_COST(scalar_expm1_op<float>)},
- {"Floor", EIGEN_COST(scalar_floor_op<float>)},
- {"Inv", EIGEN_COST(scalar_inverse_op<float>)},
- {"InvGrad", 1},
- {"Lgamma", 1},
- {"Log", EIGEN_COST(scalar_log_op<float>)},
- {"Log1p", EIGEN_COST(scalar_log1p_op<float>)},
- {"Neg", EIGEN_COST(scalar_opposite_op<float>)},
- {"QuantizeV2", quantize_v2_cost},
- {"Reciprocal", EIGEN_COST(scalar_inverse_op<float>)},
- {"Rint", 1},
- {"Round", EIGEN_COST(scalar_round_op<float>)},
- {"Rsqrt", EIGEN_COST(scalar_rsqrt_op<float>)},
- {"Sqrt", EIGEN_COST(scalar_sqrt_op<float>)},
- {"Square", EIGEN_COST(scalar_square_op<float>)},
- {"Tanh", EIGEN_COST(scalar_tanh_op<float>)},
- {"Relu", EIGEN_COST(scalar_max_op<float>)},
- {"Sigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
- {"Sign", EIGEN_COST(scalar_sign_op<float>)},
- {"Sin", EIGEN_COST(scalar_sin_op<float>)},
- {"Tan", EIGEN_COST(scalar_tan_op<float>)},
- // Binary ops alphabetically sorted
- {"Add", EIGEN_COST(scalar_sum_op<float>)},
- {"ApproximateEqual", 1},
- {"BiasAdd", EIGEN_COST(scalar_sum_op<float>)},
- {"Div", EIGEN_COST(scalar_quotient_op<float>)},
- {"Equal", 1},
- {"FloorDiv", EIGEN_COST(scalar_quotient_op<float>)},
- {"FloorMod", EIGEN_COST(scalar_mod_op<float>)},
- {"Greater", 1},
- {"GreaterEqual", 1},
- {"Less", 1},
- {"LessEqual", 1},
- {"LogicalAnd", EIGEN_COST(scalar_boolean_and_op)},
- {"LogicalNot", 1},
- {"LogicalOr", EIGEN_COST(scalar_boolean_or_op)},
- {"Maximum", EIGEN_COST(scalar_max_op<float>)},
- {"Minimum", EIGEN_COST(scalar_min_op<float>)},
- {"Mod", EIGEN_COST(scalar_mod_op<float>)},
- {"Mul", EIGEN_COST(scalar_product_op<float>)},
- {"NotEqual", 1},
- {"QuantizedAdd", EIGEN_COST(scalar_sum_op<float>)},
- {"QuantizedMul", EIGEN_COST(scalar_product_op<float>)},
- {"RealDiv", EIGEN_COST(scalar_quotient_op<float>)},
- {"ReluGrad", EIGEN_COST(scalar_max_op<float>)},
- {"SquareDifference", 1},
- {"Sub", EIGEN_COST(scalar_difference_op<float>)},
- {"TruncateDiv", EIGEN_COST(scalar_quotient_op<float>)},
- {"TruncateMod", EIGEN_COST(scalar_mod_op<float>)}};
+ elementwise_ops_ = {
+ // Unary ops alphabetically sorted
+ {"Acos", EIGEN_COST(scalar_acos_op<float>)},
+ {"Asin", EIGEN_COST(scalar_asin_op<float>)},
+ {"Atan", EIGEN_COST(scalar_atan_op<float>)},
+ {"Atan2", EIGEN_COST(scalar_quotient_op<float>) +
+ EIGEN_COST(scalar_atan_op<float>)},
+ {"Ceil", EIGEN_COST(scalar_ceil_op<float>)},
+ {"Cos", EIGEN_COST(scalar_cos_op<float>)},
+ {"Dequantize", EIGEN_COST(scalar_product_op<float>)},
+ {"Erf", 1},
+ {"Erfc", 1},
+ {"Exp", EIGEN_COST(scalar_exp_op<float>)},
+ {"Expm1", EIGEN_COST(scalar_expm1_op<float>)},
+ {"Floor", EIGEN_COST(scalar_floor_op<float>)},
+ {"Inv", EIGEN_COST(scalar_inverse_op<float>)},
+ {"InvGrad", 1},
+ {"Lgamma", 1},
+ {"Log", EIGEN_COST(scalar_log_op<float>)},
+ {"Log1p", EIGEN_COST(scalar_log1p_op<float>)},
+ {"Neg", EIGEN_COST(scalar_opposite_op<float>)},
+ {"QuantizeV2", quantize_v2_cost},
+ {"Reciprocal", EIGEN_COST(scalar_inverse_op<float>)},
+ {"Rint", 1},
+ {"Round", EIGEN_COST(scalar_round_op<float>)},
+ {"Rsqrt", EIGEN_COST(scalar_rsqrt_op<float>)},
+ {"Sqrt", EIGEN_COST(scalar_sqrt_op<float>)},
+ {"Square", EIGEN_COST(scalar_square_op<float>)},
+ {"Tanh", EIGEN_COST(scalar_tanh_op<float>)},
+ {"Relu", EIGEN_COST(scalar_max_op<float>)},
+ {"Sigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
+ {"QuantizedSigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
+ {"Sign", EIGEN_COST(scalar_sign_op<float>)},
+ {"Sin", EIGEN_COST(scalar_sin_op<float>)},
+ {"Tan", EIGEN_COST(scalar_tan_op<float>)},
+ // Binary ops alphabetically sorted
+ {"Add", EIGEN_COST(scalar_sum_op<float>)},
+ {"ApproximateEqual", 1},
+ {"BiasAdd", EIGEN_COST(scalar_sum_op<float>)},
+ {"QuantizedBiasAdd", EIGEN_COST(scalar_sum_op<float>)},
+ {"Div", EIGEN_COST(scalar_quotient_op<float>)},
+ {"Equal", 1},
+ {"FloorDiv", EIGEN_COST(scalar_quotient_op<float>)},
+ {"FloorMod", EIGEN_COST(scalar_mod_op<float>)},
+ {"Greater", 1},
+ {"GreaterEqual", 1},
+ {"Less", 1},
+ {"LessEqual", 1},
+ {"LogicalAnd", EIGEN_COST(scalar_boolean_and_op)},
+ {"LogicalNot", 1},
+ {"LogicalOr", EIGEN_COST(scalar_boolean_or_op)},
+ {"Maximum", EIGEN_COST(scalar_max_op<float>)},
+ {"Minimum", EIGEN_COST(scalar_min_op<float>)},
+ {"Mod", EIGEN_COST(scalar_mod_op<float>)},
+ {"Mul", EIGEN_COST(scalar_product_op<float>)},
+ {"NotEqual", 1},
+ {"QuantizedAdd", EIGEN_COST(scalar_sum_op<float>)},
+ {"QuantizedMul", EIGEN_COST(scalar_product_op<float>)},
+ {"RealDiv", EIGEN_COST(scalar_quotient_op<float>)},
+ {"ReluGrad", EIGEN_COST(scalar_max_op<float>)},
+ {"SquareDifference", 1},
+ {"Sub", EIGEN_COST(scalar_difference_op<float>)},
+ {"TruncateDiv", EIGEN_COST(scalar_quotient_op<float>)},
+ {"TruncateMod", EIGEN_COST(scalar_mod_op<float>)}};
#undef EIGEN_COST
@@ -675,7 +680,7 @@ int64 OpLevelCostEstimator::CountMatMulOperations(
}
ops = m_dim * n_dim * k_dim * 2;
- VLOG(1) << "Operations for Matmul" << ops;
+ VLOG(1) << "Operations for Matmul: " << ops;
if (mat_mul != nullptr) {
mat_mul->m = m_dim;
@@ -972,8 +977,10 @@ int64 OpLevelCostEstimator::CalculateTensorElementCount(
int64 OpLevelCostEstimator::CalculateTensorSize(
const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) const {
- return CalculateTensorElementCount(tensor, found_unknown_shapes) *
- DataTypeSize(BaseType(tensor.dtype()));
+ int64 count = CalculateTensorElementCount(tensor, found_unknown_shapes);
+ int size = DataTypeSize(BaseType(tensor.dtype()));
+ VLOG(2) << "Count: " << count << " DataTypeSize: " << size;
+ return count * size;
}
int64 OpLevelCostEstimator::CalculateInputSize(
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 7f68272950..6a1b0aebfa 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/device_name_utils.h"
@@ -858,8 +859,9 @@ Costs VirtualScheduler::Summary() const {
const auto& memory_cost = op_cost_pair.second.memory_time.count();
const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
if (cost) { // Skip printing out zero-cost ops.
- VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
- << cost << " / " << compute_cost << " / " << memory_cost;
+ VLOG(1) << strings::Printf(" + %30s : %c %10ld / %10ld / %10ld",
+ op.c_str(), (is_op_cost_accurate ? ' ' : '~'),
+ cost, compute_cost, memory_cost);
}
}
@@ -934,9 +936,11 @@ Costs VirtualScheduler::Summary() const {
: 0.0;
if (cost || mem_usage_percent > 1.0) {
// Print out only non-zero cost ops or ops with > 1% memory usage.
- VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
- << cost << " / " << compute_cost << " / " << memory_cost << " ("
- << strings::HumanReadableNumBytes(op_mem_usage) << " ["
+ VLOG(1) << strings::Printf(" + %30s : %c %10ld / %10ld / %10ld",
+ op.c_str(),
+ (is_op_cost_accurate ? ' ' : '~'), cost,
+ compute_cost, memory_cost)
+ << " (" << strings::HumanReadableNumBytes(op_mem_usage) << " ["
<< mem_usage_percent << "%] "
<< (persisent_ops.count(op) > 0 ? ": persistent op)" : ")");
}
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index 3e448216f9..7998f0a902 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -22,28 +22,33 @@ namespace grappler {
GraphView::GraphView(GraphDef* graph) : graph_(graph) {
for (int i = 0; i < graph_->node_size(); i++) {
auto node = graph_->mutable_node(i);
- auto rslt = nodes_.insert(std::make_pair(node->name(), node));
+ auto result = nodes_.emplace(node->name(), node);
// Check that the graph doesn't contain multiple nodes with the same name.
- CHECK(rslt.second) << "Non unique node name detected: " << node->name();
+ CHECK(result.second) << "Non unique node name detected: " << node->name();
}
+
for (NodeDef& node : *graph_->mutable_node()) {
- for (int i = 0; i < node.input_size(); ++i) {
- OutputPort fanin;
- string fanin_name = ParseNodeName(node.input(i), &fanin.port_id);
- fanin.node = nodes_[fanin_name];
+ AddFanouts(&node);
+ }
+}
- InputPort input;
- input.node = &node;
- if (fanin.port_id < 0) {
- input.port_id = -1;
- } else {
- input.port_id = i;
- num_regular_outputs_[fanin.node] =
- std::max(num_regular_outputs_[fanin.node], fanin.port_id);
- }
+void GraphView::AddFanouts(NodeDef* node) {
+ for (int i = 0; i < node->input_size(); ++i) {
+ OutputPort fanin;
+ string fanin_name = ParseNodeName(node->input(i), &fanin.port_id);
+ fanin.node = nodes_[fanin_name];
- fanouts_[fanin].insert(input);
+ InputPort input;
+ input.node = node;
+ if (fanin.port_id < 0) {
+ input.port_id = -1;
+ } else {
+ input.port_id = i;
+ num_regular_outputs_[fanin.node] =
+ std::max(num_regular_outputs_[fanin.node], fanin.port_id);
}
+
+ fanouts_[fanin].insert(input);
}
}
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 584cb9048b..050789d2e2 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -29,8 +29,11 @@ namespace grappler {
class GraphView {
public:
struct Port {
- Port() : node(nullptr), port_id(-1) {}
+ Port() = default;
Port(NodeDef* n, int port) : node(n), port_id(port) {}
+
+ // TODO(prazek): ports should keep the constness of GraphView. The only way
+    // to modify the graph through the view should be via MutableGraphView.
NodeDef* node = nullptr;
int port_id = -1;
@@ -111,13 +114,22 @@ class GraphView {
std::unordered_set<Edge, HashEdge> GetFaninEdges(
const NodeDef& node, bool include_controlling_edges) const;
+ protected:
+  // Adds a fanout entry for every input of `node`.
+ void AddFanouts(NodeDef* node);
+ std::unordered_map<string, NodeDef*>* MutableNodes() { return &nodes_; }
+ GraphDef* MutableGraph() { return graph_; }
+
+ using FanoutsMapType =
+ std::unordered_map<OutputPort, std::unordered_set<InputPort, HashPort>,
+ HashPort>;
+ FanoutsMapType* MutableFanouts() { return &fanouts_; }
+
private:
GraphDef* graph_;
std::unordered_map<string, NodeDef*> nodes_;
std::unordered_set<InputPort, HashPort> empty_set_;
- std::unordered_map<OutputPort, std::unordered_set<InputPort, HashPort>,
- HashPort>
- fanouts_;
+ FanoutsMapType fanouts_;
std::unordered_map<const NodeDef*, int> num_regular_outputs_;
};
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
new file mode 100644
index 0000000000..6abafe11a2
--- /dev/null
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+NodeDef* MutableGraphView::AddNode(NodeDef&& node) {
+ auto* node_in_graph = GetGraph()->add_node();
+ *node_in_graph = std::move(node);
+
+ auto result = MutableNodes()->emplace(node_in_graph->name(), node_in_graph);
+ // Check that the graph doesn't contain multiple nodes with the same name.
+ CHECK(result.second) << "Non unique node name detected: "
+ << node_in_graph->name();
+ AddFanouts(node_in_graph);
+ return node_in_graph;
+}
+
+void MutableGraphView::ReplaceInput(const NodeDef& old_input,
+ const NodeDef& new_input,
+ const int output_port_id) {
+ GraphView::OutputPort output_port =
+ GetOutputPort(old_input.name(), output_port_id);
+ auto fanout = GetFanout(output_port);
+ for (auto& input_port : fanout) {
+ input_port.node->set_input(input_port.port_id, new_input.name());
+ AddFanouts(input_port.node);
+ }
+}
+
+void MutableGraphView::DeleteNodes(const std::set<string>& nodes_to_delete) {
+ for (const string& node_name_to_delete : nodes_to_delete)
+ RemoveFanouts(MutableNodes()->at(node_name_to_delete));
+ for (const string& node_name_to_delete : nodes_to_delete)
+ MutableNodes()->erase(node_name_to_delete);
+ EraseNodesFromGraph(nodes_to_delete, GetGraph());
+}
+
+void MutableGraphView::RemoveFanouts(NodeDef* node) {
+ for (int i = 0; i < node->input_size(); ++i) {
+ OutputPort fanin;
+ string fanin_name = ParseNodeName(node->input(i), &fanin.port_id);
+ fanin.node = (*MutableNodes())[fanin_name];
+
+ InputPort input;
+ input.node = node;
+ if (fanin.port_id < 0)
+ input.port_id = -1;
+ else
+ input.port_id = i;
+
+ (*MutableFanouts())[fanin].erase(input);
+ }
+}
+
+} // end namespace grappler
+} // end namespace tensorflow
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
new file mode 100644
index 0000000000..105eb972e8
--- /dev/null
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
+#define TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
+
+#include "tensorflow/core/grappler/graph_view.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// A utility class to simplify the traversal of a GraphDef that, unlike
+// GraphView, supports updating the graph. Note that you should not modify the
+// graph separately, because the view will get out of sync.
+class MutableGraphView : public GraphView {
+ public:
+ using GraphView::GraphView;
+
+ GraphDef* GetGraph() { return MutableGraph(); }
+  // Adds a new node to the graph and updates the view.
+ NodeDef* AddNode(NodeDef&& node);
+
+  // Replaces every use of the output `output_port_id` of node 'old_input'
+  // with the first output of node 'new_input'.
+ //
+  // E.g.: We have 2 nodes that use outputs of 'bar' as inputs:
+  //      foo(bar:0, bar:1), foo2(other:0, bar:0)
+  // Calling ReplaceInput(bar, new, 0) replaces every occurrence of bar:0 with
+  // new:0. Result:
+ // foo(new:0, bar:1), foo2(other:0, new:0)
+ void ReplaceInput(const NodeDef& old_input, const NodeDef& new_input,
+ int output_port_id = 0);
+
+ // Deletes nodes from the graph.
+ void DeleteNodes(const std::set<string>& nodes_to_delete);
+
+ private:
+ void RemoveFanouts(NodeDef* node);
+};
+
+} // end namespace grappler
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
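A hedged illustration of the staleness hazard the class comment warns about; graph_def and node are assumed to exist in the surrounding code:

    MutableGraphView view(&graph_def);
    graph_def.add_node();           // wrong: the view's name map is now stale
    view.AddNode(std::move(node));  // right: updates graph and view together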
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
new file mode 100644
index 0000000000..f09dfb8271
--- /dev/null
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+TEST(MutableGraphViewTest, AddAndReplaceInput) {
+ // This outputs simple graph like:
+ // x
+ // / \
+ // Square Square_1
+ // | \ / |
+ // | \/ |
+ // | /\ |
+ // | / \ |
+ // AddN AddN_1
+ // \ /
+ // y
+ TrivialTestGraphInputYielder fake_input(2, 2, 2, false, {"/CPU:0", "/GPU:0"});
+ GrapplerItem item;
+ CHECK(fake_input.NextItem(&item));
+
+ GraphDef new_graph = item.graph;
+ MutableGraphView graph(&new_graph);
+
+ GraphView::InputPort input = graph.GetInputPort("AddN", 0);
+ EXPECT_EQ("AddN", input.node->name());
+ EXPECT_EQ(0, input.port_id);
+ GraphView::OutputPort fanin = graph.GetRegularFanin(input);
+ EXPECT_EQ("Square", fanin.node->name());
+ EXPECT_EQ(0, fanin.port_id);
+
+ auto find_child_with_name = [&graph](string output_port_name,
+ string input_name) {
+ GraphView::OutputPort output_port =
+ graph.GetOutputPort(output_port_name, 0);
+ auto fanout = graph.GetFanout(output_port);
+ for (auto& input_port : fanout) {
+ if (input_port.node->name() == input_name) return true;
+ }
+ return false;
+ };
+
+ EXPECT_FALSE(find_child_with_name("Square", "new_node"));
+
+ NodeDef new_node = *input.node;
+ new_node.set_name("new_node");
+
+ EXPECT_EQ(graph.GetNode("new_node"), nullptr);
+ NodeDef* node_in_graph = graph.AddNode(std::move(new_node));
+ EXPECT_NE(graph.GetNode("new_node"), nullptr);
+
+ graph.ReplaceInput(*input.node, *node_in_graph);
+ EXPECT_TRUE(find_child_with_name("Square", "new_node"));
+ EXPECT_TRUE(find_child_with_name("new_node", "y"));
+}
+
+TEST(MutableGraphViewTest, DeleteNodes) {
+ // Outputs simple graph as described in first test.
+ TrivialTestGraphInputYielder fake_input(2, 2, 2, false, {"/CPU:0", "/GPU:0"});
+ GrapplerItem item;
+ CHECK(fake_input.NextItem(&item));
+
+ GraphDef new_graph = item.graph;
+ MutableGraphView graph(&new_graph);
+
+ EXPECT_NE(graph.GetNode("AddN"), nullptr);
+ graph.DeleteNodes({"AddN"});
+
+ EXPECT_EQ(graph.GetNode("AddN"), nullptr);
+}
+
+} // namespace
+} // namespace grappler
+} // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 97862d1ed0..3ab2211694 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -3007,14 +3007,7 @@ void ArithmeticOptimizer::DedupComputations() {
// Delete duplicates
if (fetch_nodes_known_ && !duplicates.empty()) {
- int last = optimized_graph_->node_size() - 1;
- for (auto it = duplicates.rbegin(); it != duplicates.rend(); ++it) {
- int index = *it;
- optimized_graph_->mutable_node()->SwapElements(index, last);
- last--;
- }
- optimized_graph_->mutable_node()->DeleteSubrange(last + 1,
- duplicates.size());
+ EraseNodesFromGraph(duplicates, optimized_graph_);
// Rebuild the NodeMap which was invalidated by the node swapping above.
node_map_.reset(new NodeMap(optimized_graph_));
}
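Both this hunk and the constant-folding one below replace the same hand-rolled deletion idiom with EraseNodesFromGraph; the removed pattern, generalized, looked roughly like this (indices assumed sorted ascending):

    #include <vector>

    // Move each doomed node to the tail, then chop the tail off in one call.
    void EraseNodesByIndex(const std::vector<int>& indices, GraphDef* graph) {
      int last = graph->node_size() - 1;
      for (auto it = indices.rbegin(); it != indices.rend(); ++it) {
        graph->mutable_node()->SwapElements(*it, last);
        --last;
      }
      graph->mutable_node()->DeleteSubrange(last + 1,
                                            static_cast<int>(indices.size()));
    }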
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 76c928f995..f016fae3a5 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1305,17 +1305,12 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
}
// Delete the newly created nodes that don't feed anything.
- int last = output->node_size() - 1;
- for (int i = output->node_size() - 1; i >= 0; --i) {
- const NodeDef& node = output->node(i);
- auto fanout = node_map_->GetOutputs(node.name());
- if (fanout.empty()) {
- output->mutable_node()->SwapElements(i, last);
- last--;
- }
+ std::vector<int> nodes_to_delete;
+ for (int i = 0; i < output->node_size(); i++) {
+ auto fanout = node_map_->GetOutputs(output->node(i).name());
+ if (fanout.empty()) nodes_to_delete.push_back(i);
}
- output->mutable_node()->DeleteSubrange(last + 1,
- output->node_size() - last - 1);
+ EraseNodesFromGraph(std::move(nodes_to_delete), output);
for (const auto& node : graph_->node()) {
// If no fetch nodes is provided, we conservatively
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index c8946c499c..d7ac58c99d 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -46,7 +46,7 @@ cc_library(
deps = [
"//tensorflow/core:framework",
"//tensorflow/core:lib",
- "//tensorflow/core/grappler:graph_view",
+ "//tensorflow/core/grappler:mutable_graph_view",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:utils",
] + tf_protos_all(),
@@ -58,8 +58,14 @@ tf_cc_test(
visibility = ["//visibility:public"],
deps = [
":graph_utils",
+ "//tensorflow/core:framework",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/grappler:grappler_item",
+ "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+ "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+ "//tensorflow/core/kernels:cast_op",
],
)
@@ -73,7 +79,7 @@ cc_library(
deps = [
":graph_utils",
"//tensorflow/core:lib",
- "//tensorflow/core/grappler:graph_view",
+ "//tensorflow/core/grappler:mutable_graph_view",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:op_types",
"//tensorflow/core/grappler:utils",
@@ -98,6 +104,43 @@ tf_cc_test(
)
cc_library(
+ name = "map_fusion",
+ srcs = ["map_fusion.cc"],
+ hdrs = [
+ "map_fusion.h",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ ":graph_utils",
+ "//tensorflow/core/grappler:mutable_graph_view",
+ "//tensorflow/core:lib",
+ "//tensorflow/core/grappler:grappler_item",
+ "//tensorflow/core/grappler:op_types",
+ "//tensorflow/core/grappler:utils",
+ "//tensorflow/core/grappler/clusters:cluster",
+ "//tensorflow/core/kernels:cast_op",
+ "//tensorflow/core/grappler/utils:topological_sort",
+ "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+ "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+ ] + tf_protos_all(),
+)
+
+tf_cc_test(
+ name = "map_fusion_test",
+ srcs = ["map_fusion_test.cc"],
+ visibility = ["//visibility:public"],
+ deps = [
+ ":graph_utils",
+ ":map_fusion",
+ "//tensorflow/core:framework",
+ "//tensorflow/core:test",
+ "//tensorflow/core:test_main",
+ "//tensorflow/core:testlib",
+ "//tensorflow/core/grappler:grappler_item",
+ ],
+)
+
+cc_library(
name = "noop_elimination",
srcs = ["noop_elimination.cc"],
hdrs = [
@@ -107,7 +150,7 @@ cc_library(
deps = [
":graph_utils",
"//tensorflow/core:lib",
- "//tensorflow/core/grappler:graph_view",
+ "//tensorflow/core/grappler:mutable_graph_view",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:op_types",
"//tensorflow/core/grappler:utils",
@@ -141,7 +184,7 @@ cc_library(
deps = [
":graph_utils",
"//tensorflow/core:lib",
- "//tensorflow/core/grappler:graph_view",
+ "//tensorflow/core/grappler:mutable_graph_view",
"//tensorflow/core/grappler:grappler_item",
"//tensorflow/core/grappler:op_types",
"//tensorflow/core/grappler:utils",
@@ -171,6 +214,7 @@ cc_library(
deps = [
":function_rename",
":map_and_batch_fusion",
+ ":map_fusion",
":noop_elimination",
":shuffle_and_repeat_fusion",
],
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index ea5f450009..6ce6533369 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -16,7 +16,7 @@ limitations under the License.
#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
#include "tensorflow/core/framework/device_base.h"
-#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/util/ptr_util.h"
namespace tensorflow {
@@ -26,14 +26,12 @@ namespace {
constexpr char kConstOpName[] = "Const";
-int FindNodeWithPredicate(const std::function<bool(const NodeDef&)>& predicate,
- const GraphDef& graph) {
- for (int i = 0; i < graph.node_size(); ++i) {
- if (predicate(graph.node(i))) {
- return i;
- }
- }
- return -1;
+template <typename Predicate, typename Collection>
+int GetElementIdxWithPredicate(const Predicate& predicate,
+ const Collection& collection) {
+ auto it = std::find_if(collection.begin(), collection.end(), predicate);
+ if (it == collection.end()) return -1;
+ return std::distance(collection.begin(), it);
}
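
GetElementIdxWithPredicate is deliberately container-agnostic: anything with begin()/end() works, which is what lets the helpers below reuse one implementation for graph.node(), library.function(), and function.node_def(). A minimal usage sketch:

    #include <algorithm>
    #include <iterator>
    #include <string>
    #include <vector>

    template <typename Predicate, typename Collection>
    int GetElementIdxWithPredicate(const Predicate& predicate,
                                   const Collection& collection) {
      auto it = std::find_if(collection.begin(), collection.end(), predicate);
      if (it == collection.end()) return -1;
      return std::distance(collection.begin(), it);
    }

    int main() {
      const std::vector<std::string> ops = {"RangeDataset", "MapDataset",
                                            "BatchDataset"};
      // idx == 1; a predicate that matches nothing yields -1.
      const int idx = GetElementIdxWithPredicate(
          [](const std::string& op) { return op == "MapDataset"; }, ops);
      return idx == 1 ? 0 : 1;
    }
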
std::vector<int> CreateNameIndex(const GraphDef& graph) {
@@ -62,13 +60,14 @@ std::vector<int> CreateInputIndex(const NodeDef& node) {
return index;
}
-Status AddScalarConstNodeHelper(
+NodeDef* AddScalarConstNodeHelper(
DataType dtype, const std::function<void(TensorProto*)>& add_value,
- GraphDef* graph, NodeDef** result) {
- NodeDef* node = graph->add_node();
- node->set_op(kConstOpName);
- SetUniqueName(kConstOpName, graph, node);
- (*node->mutable_attr())["dtype"].set_type(dtype);
+ MutableGraphView* graph) {
+ NodeDef node;
+ node.set_op(kConstOpName);
+ SetUniqueGraphNodeName(kConstOpName, graph->GetGraph(), &node);
+
+ (*node.mutable_attr())["dtype"].set_type(dtype);
std::unique_ptr<tensorflow::TensorProto> tensor =
tensorflow::MakeUnique<tensorflow::TensorProto>();
std::unique_ptr<tensorflow::TensorShapeProto> tensor_shape =
@@ -76,75 +75,69 @@ Status AddScalarConstNodeHelper(
tensor->set_allocated_tensor_shape(tensor_shape.release());
tensor->set_dtype(dtype);
add_value(tensor.get());
- (*node->mutable_attr())["value"].set_allocated_tensor(tensor.release());
- *result = node;
- return Status::OK();
+ (*node.mutable_attr())["value"].set_allocated_tensor(tensor.release());
+
+ return graph->AddNode(std::move(node));
}
} // namespace
-Status AddNode(const string& name, const string& op,
- const std::vector<string>& inputs,
- const std::vector<std::pair<string, AttrValue>>& attributes,
- GraphDef* graph, NodeDef** result) {
- NodeDef* node = graph->add_node();
+NodeDef* AddNode(const string& name, const string& op,
+ const std::vector<string>& inputs,
+ const std::vector<std::pair<string, AttrValue>>& attributes,
+ MutableGraphView* graph) {
+ NodeDef node;
if (!name.empty()) {
- node->set_name(name);
+ node.set_name(name);
} else {
- SetUniqueName(op, graph, node);
+ SetUniqueGraphNodeName(op, graph->GetGraph(), &node);
}
- node->set_op(op);
+ node.set_op(op);
for (const string& input : inputs) {
- node->add_input(input);
+ node.add_input(input);
}
for (auto attr : attributes) {
- (*node->mutable_attr())[attr.first] = attr.second;
+ (*node.mutable_attr())[attr.first] = attr.second;
}
- *result = node;
- return Status::OK();
+ return graph->AddNode(std::move(node));
}
template <>
-Status AddScalarConstNode(bool v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(bool v, MutableGraphView* graph) {
return AddScalarConstNodeHelper(
- DT_BOOL, [v](TensorProto* proto) { proto->add_bool_val(v); }, graph,
- result);
+ DT_BOOL, [v](TensorProto* proto) { proto->add_bool_val(v); }, graph);
}
template <>
-Status AddScalarConstNode(double v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(double v, MutableGraphView* graph) {
return AddScalarConstNodeHelper(
- DT_DOUBLE, [v](TensorProto* proto) { proto->add_double_val(v); }, graph,
- result);
+ DT_DOUBLE, [v](TensorProto* proto) { proto->add_double_val(v); }, graph);
}
template <>
-Status AddScalarConstNode(float v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(float v, MutableGraphView* graph) {
return AddScalarConstNodeHelper(
- DT_FLOAT, [v](TensorProto* proto) { proto->add_float_val(v); }, graph,
- result);
+ DT_FLOAT, [v](TensorProto* proto) { proto->add_float_val(v); }, graph);
}
template <>
-Status AddScalarConstNode(int v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(int v, MutableGraphView* graph) {
return AddScalarConstNodeHelper(
- DT_INT32, [v](TensorProto* proto) { proto->add_int_val(v); }, graph,
- result);
+ DT_INT32, [v](TensorProto* proto) { proto->add_int_val(v); }, graph);
}
template <>
-Status AddScalarConstNode(int64 v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(int64 v, MutableGraphView* graph) {
return AddScalarConstNodeHelper(
- DT_INT64, [v](TensorProto* proto) { proto->add_int64_val(v); }, graph,
- result);
+ DT_INT64, [v](TensorProto* proto) { proto->add_int64_val(v); }, graph);
}
template <>
-Status AddScalarConstNode(StringPiece v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(StringPiece v, MutableGraphView* graph) {
return AddScalarConstNodeHelper(
DT_STRING,
[v](TensorProto* proto) { proto->add_string_val(v.data(), v.size()); },
- graph, result);
+ graph);
}
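
Each specialization above funnels into AddScalarConstNodeHelper with a different dtype and setter lambda, while the unspecialized primary template (declared in the header below) is rejected at compile time. The overall shape of that pattern, reduced to hypothetical stand-in types rather than the real NodeDef API:

    #include <cstdint>
    #include <string>
    #include <type_traits>

    struct Node {
      std::string op;
      std::string value_repr;
    };

    // Primary template: the dependent-false static_assert rejects any type
    // without an explicit specialization, at the point of instantiation.
    template <typename T>
    Node MakeScalarConst(T) {
      static_assert(!std::is_same<T, T>::value,
                    "Only explicitly specialized types are supported.");
      return {};
    }

    template <>
    Node MakeScalarConst(bool v) {
      return {"Const", v ? "true" : "false"};
    }

    template <>
    Node MakeScalarConst(int64_t v) {
      return {"Const", std::to_string(v)};
    }
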
bool Compare(const GraphDef& g1, const GraphDef& g2) {
@@ -177,52 +170,82 @@ bool Compare(const GraphDef& g1, const GraphDef& g2) {
return true;
}
-bool ContainsNodeWithName(const string& name, const GraphDef& graph) {
- return FindNodeWithName(name, graph) != -1;
+bool ContainsGraphNodeWithName(const string& name, const GraphDef& graph) {
+ return FindGraphNodeWithName(name, graph) != -1;
}
bool ContainsNodeWithOp(const string& op, const GraphDef& graph) {
return FindNodeWithOp(op, graph) != -1;
}
-Status DeleteNodes(const std::set<string>& nodes_to_delete, GraphDef* graph) {
- int last = graph->node_size() - 1;
- for (int i = graph->node_size() - 1; i >= 0; --i) {
- const NodeDef& node = graph->node(i);
- if (nodes_to_delete.find(node.name()) != nodes_to_delete.end()) {
- graph->mutable_node()->SwapElements(i, last);
- last--;
- }
- }
- graph->mutable_node()->DeleteSubrange(last + 1,
- graph->node_size() - last - 1);
- return Status::OK();
+bool ContainsGraphFunctionWithName(const string& name,
+ const FunctionDefLibrary& library) {
+ return FindGraphFunctionWithName(name, library) != -1;
}
-int FindNodeWithName(const string& name, const GraphDef& graph) {
- return FindNodeWithPredicate(
- [name](const NodeDef& node) { return node.name() == name; }, graph);
+bool ContainsFunctionNodeWithName(const string& name,
+ const FunctionDef& function) {
+ return FindFunctionNodeWithName(name, function) != -1;
+}
+
+int FindGraphNodeWithName(const string& name, const GraphDef& graph) {
+ return GetElementIdxWithPredicate(
+ [&name](const NodeDef& node) { return node.name() == name; },
+ graph.node());
}
int FindNodeWithOp(const string& op, const GraphDef& graph) {
- return FindNodeWithPredicate(
- [op](const NodeDef& node) { return node.op() == op; }, graph);
+ return GetElementIdxWithPredicate(
+ [&op](const NodeDef& node) { return node.op() == op; }, graph.node());
}
-void SetUniqueName(const string& op, GraphDef* graph, NodeDef* node) {
+int FindGraphFunctionWithName(const string& name,
+ const FunctionDefLibrary& library) {
+ return GetElementIdxWithPredicate(
+ [&name](const FunctionDef& function) {
+ return function.signature().name() == name;
+ },
+ library.function());
+}
+
+int FindFunctionNodeWithName(const string& name, const FunctionDef& function) {
+ return GetElementIdxWithPredicate(
+ [&name](const NodeDef& node) { return node.name() == name; },
+ function.node_def());
+}
+
+void SetUniqueGraphNodeName(const string& prefix, GraphDef* graph,
+ NodeDef* node) {
+ string name = prefix;
int id = graph->node_size();
- while (ContainsNodeWithName(strings::StrCat(op, "/_", id), *graph)) {
+ while (ContainsGraphNodeWithName(name, *graph)) {
+ name = strings::StrCat(prefix, "/_", id);
++id;
}
- node->set_name(strings::StrCat(op, "/_", id));
+ node->set_name(std::move(name));
}
-void ReplaceInput(const NodeDef& old_input, const NodeDef& new_input,
- GraphView* graph) {
- GraphView::OutputPort output_port = graph->GetOutputPort(old_input.name(), 0);
- auto fanout = graph->GetFanout(output_port);
- for (auto& input_port : fanout)
- input_port.node->set_input(0, new_input.name());
+void SetUniqueFunctionNodeName(const string& prefix, FunctionDef* function,
+ NodeDef* node) {
+ string name = prefix;
+ int id = function->node_def_size();
+ while (ContainsFunctionNodeWithName(name, *function)) {
+ name = strings::StrCat(prefix, "/_", id);
+ ++id;
+ }
+ node->set_name(std::move(name));
+}
+
+void SetUniqueGraphFunctionName(const string& prefix,
+ FunctionDefLibrary* library,
+ FunctionDef* function) {
+ string name = prefix;
+ int id = library->function_size();
+ while (ContainsGraphFunctionWithName(name, *library)) {
+ name = strings::StrCat(prefix, "/_", id);
+ ++id;
+ }
+ function->mutable_signature()->set_name(name);
}
} // end namespace graph_utils
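
All three Set-Unique-*-Name helpers share one loop: start from the caller's prefix and, on a collision, append "/_<id>" with an increasing counter until the name is free. Seeding the counter with the container's current size makes the first candidate unique in the common case. The loop in isolation, with a set standing in for the graph, function, or library lookup:

    #include <string>
    #include <unordered_set>

    std::string UniqueName(const std::string& prefix, int id,
                           const std::unordered_set<std::string>& taken) {
      std::string name = prefix;
      while (taken.count(name) > 0) {
        name = prefix + "/_" + std::to_string(id);
        ++id;
      }
      return name;
    }
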
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index 1cb0f0c81d..0847748802 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -17,12 +17,13 @@ limitations under the License.
#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_UTILS_H_
#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.pb.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/errors.h"
@@ -31,57 +32,83 @@ namespace grappler {
namespace graph_utils {
// Adds a node to the graph.
-Status AddNode(const string& name, const string& op,
- const std::vector<string>& inputs,
- const std::vector<std::pair<string, AttrValue>>& attributes,
- GraphDef* graph, NodeDef** result);
+NodeDef* AddNode(const string& name, const string& op,
+ const std::vector<string>& inputs,
+ const std::vector<std::pair<string, AttrValue>>& attributes,
+ MutableGraphView* graph);
// Adds a Const node with the given value to the graph.
template <typename T>
-Status AddScalarConstNode(T v, GraphDef* graph, NodeDef** result) {
- return errors::Unimplemented("Type %s is not supported.",
- DataTypeToEnum<T>::value);
+NodeDef* AddScalarConstNode(T v, MutableGraphView* graph) {
+  // The !std::is_same<T, T> condition depends on T, so the assertion only
+  // fires when this primary template is instantiated; a plain `false` would
+  // fail to compile even without any instantiation.
+ static_assert(!std::is_same<T, T>::value,
+ "Invalid specialization of this method for type T.");
+ return {};
}
+
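
The !std::is_same<T, T>::value expression is the usual dependent-false idiom: a literal false would fire the assertion in every translation unit, whereas an expression that depends on T only fires when the primary template is actually instantiated. The same idiom is often spelled with a helper trait:

    #include <type_traits>

    template <typename T>
    struct dependent_false : std::false_type {};

    template <typename T>
    void OnlySpecialized() {
      static_assert(dependent_false<T>::value,
                    "Use one of the explicit specializations.");
    }
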
template <>
-Status AddScalarConstNode(bool v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(bool v, MutableGraphView* graph);
template <>
-Status AddScalarConstNode(double v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(double v, MutableGraphView* graph);
template <>
-Status AddScalarConstNode(float v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(float v, MutableGraphView* graph);
template <>
-Status AddScalarConstNode(int v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(int v, MutableGraphView* graph);
template <>
-Status AddScalarConstNode(int64 v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(int64 v, MutableGraphView* graph);
template <>
-Status AddScalarConstNode(StringPiece v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(StringPiece v, MutableGraphView* graph);
// Checks whether the two graphs are the same.
bool Compare(const GraphDef& g1, const GraphDef& g2);
// Checks whether the graph contains a node with the given name.
-bool ContainsNodeWithName(const string& name, const GraphDef& graph);
+bool ContainsGraphNodeWithName(const string& name, const GraphDef& graph);
+
+// Checks whether the library contains a function with the given name.
+bool ContainsGraphFunctionWithName(const string& name,
+ const FunctionDefLibrary& library);
+
+// Checks whether the function contains a node with the given name.
+bool ContainsFunctionNodeWithName(const string& name,
+ const FunctionDef& function);
// Checks whether the graph contains a node with the given op.
bool ContainsNodeWithOp(const string& op, const GraphDef& graph);
-// Deletes nodes from the graph.
-Status DeleteNodes(const std::set<string>& nodes_to_delete, GraphDef* graph);
-
// Returns the index of the node with the given name or -1 if the node does
// not exist.
-int FindNodeWithName(const string& name, const GraphDef& graph);
+int FindGraphNodeWithName(const string& name, const GraphDef& graph);
+
+// Returns the index of the function with the given name or -1 if the function
+// does not exist.
+int FindGraphFunctionWithName(const string& name,
+ const FunctionDefLibrary& library);
+
+// Returns the index of the function node with the given name or -1 if the
+// function node does not exist.
+int FindFunctionNodeWithName(const string& name, const FunctionDef& function);
// Returns the index of a node with the given op or -1 if no such node
// exists.
int FindNodeWithOp(const string& op, const GraphDef& graph);
-// Sets the node name using the op name as a prefix while guaranteeing the name
+// Sets the node name using `prefix` as a prefix while guaranteeing the name
// is unique across the graph.
-void SetUniqueName(const string& op, GraphDef* graph, NodeDef* node);
-
-// Replaces the input for the output nodes of 'old_input' with 'new_input'.
-void ReplaceInput(const NodeDef& old_input, const NodeDef& new_input,
- GraphView* graph);
+void SetUniqueGraphNodeName(const string& prefix, GraphDef* graph,
+ NodeDef* node);
+
+// Sets the function node name using `prefix` as a prefix while guaranteeing
+// the name is unique across the function's nodes.
+void SetUniqueFunctionNodeName(const string& prefix, FunctionDef* function,
+ NodeDef* node);
+
+// Sets the function name using `prefix` as a prefix while guaranteeing the
+// name is unique across the function library.
+void SetUniqueGraphFunctionName(const string& prefix,
+ FunctionDefLibrary* library,
+ FunctionDef* function);
} // end namespace graph_utils
} // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index d723d73b7a..59ed79ab8f 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/framework/function_testlib.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
@@ -24,149 +25,191 @@ namespace graph_utils {
namespace {
TEST(GraphUtilsTest, AddScalarConstNodeBool) {
- GraphDef graph;
- NodeDef* bool_node;
- TF_EXPECT_OK(AddScalarConstNode<bool>(true, &graph, &bool_node));
- EXPECT_TRUE(ContainsNodeWithName(bool_node->name(), graph));
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ NodeDef* bool_node = AddScalarConstNode<bool>(true, &graph);
+ EXPECT_TRUE(ContainsGraphNodeWithName(bool_node->name(), *graph.GetGraph()));
EXPECT_EQ(bool_node->attr().at("value").tensor().bool_val(0), true);
}
TEST(GraphUtilsTest, AddScalarConstNodeDouble) {
- GraphDef graph;
- NodeDef* double_node;
- TF_EXPECT_OK(AddScalarConstNode<double>(3.14, &graph, &double_node));
- EXPECT_TRUE(ContainsNodeWithName(double_node->name(), graph));
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ NodeDef* double_node = AddScalarConstNode<double>(3.14, &graph);
+ EXPECT_TRUE(
+ ContainsGraphNodeWithName(double_node->name(), *graph.GetGraph()));
  EXPECT_DOUBLE_EQ(double_node->attr().at("value").tensor().double_val(0), 3.14);
}
TEST(GraphUtilsTest, AddScalarConstNodeFloat) {
- GraphDef graph;
- NodeDef* float_node;
- TF_EXPECT_OK(AddScalarConstNode<float>(3.14, &graph, &float_node));
- EXPECT_TRUE(ContainsNodeWithName(float_node->name(), graph));
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ NodeDef* float_node = AddScalarConstNode<float>(3.14, &graph);
+ EXPECT_TRUE(ContainsGraphNodeWithName(float_node->name(), *graph.GetGraph()));
EXPECT_FLOAT_EQ(float_node->attr().at("value").tensor().float_val(0), 3.14);
}
TEST(GraphUtilsTest, AddScalarConstNodeInt) {
- GraphDef graph;
- NodeDef* int_node;
- TF_EXPECT_OK(AddScalarConstNode<int>(42, &graph, &int_node));
- EXPECT_TRUE(ContainsNodeWithName(int_node->name(), graph));
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ NodeDef* int_node = AddScalarConstNode<int>(42, &graph);
+ EXPECT_TRUE(ContainsGraphNodeWithName(int_node->name(), *graph.GetGraph()));
EXPECT_EQ(int_node->attr().at("value").tensor().int_val(0), 42);
}
TEST(GraphUtilsTest, AddScalarConstNodeInt64) {
- GraphDef graph;
- NodeDef* int64_node;
- TF_EXPECT_OK(AddScalarConstNode<int64>(42, &graph, &int64_node));
- EXPECT_TRUE(ContainsNodeWithName(int64_node->name(), graph));
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ NodeDef* int64_node = AddScalarConstNode<int64>(42, &graph);
+ EXPECT_TRUE(ContainsGraphNodeWithName(int64_node->name(), *graph.GetGraph()));
EXPECT_EQ(int64_node->attr().at("value").tensor().int64_val(0), 42);
}
TEST(GraphUtilsTest, AddScalarConstNodeString) {
- GraphDef graph;
- NodeDef* string_node;
- TF_EXPECT_OK(AddScalarConstNode<StringPiece>("hello", &graph, &string_node));
- EXPECT_TRUE(ContainsNodeWithName(string_node->name(), graph));
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ NodeDef* string_node = AddScalarConstNode<StringPiece>("hello", &graph);
+ EXPECT_TRUE(
+ ContainsGraphNodeWithName(string_node->name(), *graph.GetGraph()));
EXPECT_EQ(string_node->attr().at("value").tensor().string_val(0), "hello");
}
TEST(GraphUtilsTest, Compare) {
- GraphDef graphA;
- GraphDef graphB;
- EXPECT_TRUE(Compare(graphA, graphB));
+ GraphDef graph_def_a;
+ MutableGraphView graph_a(&graph_def_a);
+ GraphDef graph_def_b;
+ MutableGraphView graph_b(&graph_def_b);
- NodeDef* nodeA;
- TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graphA, &nodeA));
- NodeDef* nodeB;
- TF_EXPECT_OK(AddNode("B", "OpB", {"A"}, {}, &graphA, &nodeB));
- EXPECT_FALSE(Compare(graphA, graphB));
+ EXPECT_TRUE(Compare(graph_def_a, graph_def_b));
- graphB.mutable_node()->CopyFrom(graphA.node());
- EXPECT_TRUE(Compare(graphA, graphB));
+ AddNode("A", "OpA", {}, {}, &graph_a);
+ AddNode("B", "OpB", {"A"}, {}, &graph_a);
+ EXPECT_FALSE(Compare(graph_def_a, graph_def_b));
+
+ graph_def_b.mutable_node()->CopyFrom(graph_def_a.node());
+ EXPECT_TRUE(Compare(graph_def_a, graph_def_b));
}
-TEST(GraphUtilsTest, ContainsNodeWithName) {
- GraphDef graph;
- EXPECT_TRUE(!ContainsNodeWithName("A", graph));
+TEST(GraphUtilsTest, ContainsGraphNodeWithName) {
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.GetGraph()));
- NodeDef* node;
- TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graph, &node));
- EXPECT_TRUE(ContainsNodeWithName("A", graph));
+ AddNode("A", "OpA", {}, {}, &graph);
+ EXPECT_TRUE(ContainsGraphNodeWithName("A", *graph.GetGraph()));
- TF_EXPECT_OK(DeleteNodes({"A"}, &graph));
- EXPECT_TRUE(!ContainsNodeWithName("A", graph));
+ graph.DeleteNodes({"A"});
+ EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.GetGraph()));
}
-TEST(GraphUtilsTest, ContainsNodeWithOp) {
- GraphDef graph;
- EXPECT_TRUE(!ContainsNodeWithOp("OpA", graph));
+TEST(GraphUtilsTest, ContainsGraphFunctionWithName) {
+ FunctionDefLibrary library;
+ EXPECT_FALSE(ContainsGraphFunctionWithName("new_function", library));
+ FunctionDef* new_function = library.add_function();
+ SetUniqueGraphFunctionName("new_function", &library, new_function);
- NodeDef* node;
- TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graph, &node));
- EXPECT_TRUE(ContainsNodeWithOp("OpA", graph));
+ EXPECT_TRUE(
+ ContainsGraphFunctionWithName(new_function->signature().name(), library));
+}
- TF_EXPECT_OK(DeleteNodes({"A"}, &graph));
- EXPECT_TRUE(!ContainsNodeWithOp("OpA", graph));
+TEST(GraphUtilsTest, ContainsFunctionNodeWithName) {
+ FunctionDef function = test::function::XTimesTwo();
+ EXPECT_FALSE(ContainsFunctionNodeWithName(
+ "weird_name_that_should_not_be_there", function));
+ EXPECT_TRUE(ContainsFunctionNodeWithName("two", function));
}
-TEST(GraphUtilsTest, FindNodeWithName) {
- GraphDef graph;
- EXPECT_EQ(FindNodeWithName("A", graph), -1);
+TEST(GraphUtilsTest, ContainsNodeWithOp) {
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.GetGraph()));
- NodeDef* node;
- TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graph, &node));
- EXPECT_NE(FindNodeWithName("A", graph), -1);
+ AddNode("A", "OpA", {}, {}, &graph);
+ EXPECT_TRUE(ContainsNodeWithOp("OpA", *graph.GetGraph()));
- TF_EXPECT_OK(DeleteNodes({"A"}, &graph));
- EXPECT_EQ(FindNodeWithName("A", graph), -1);
+ graph.DeleteNodes({"A"});
+ EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.GetGraph()));
}
-TEST(GraphUtilsTest, FindNodeWithOp) {
- GraphDef graph;
- EXPECT_EQ(FindNodeWithOp("OpA", graph), -1);
+TEST(GraphUtilsTest, FindGraphNodeWithName) {
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ EXPECT_EQ(FindGraphNodeWithName("A", *graph.GetGraph()), -1);
- NodeDef* node;
- TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graph, &node));
- EXPECT_NE(FindNodeWithOp("OpA", graph), -1);
+ AddNode("A", "OpA", {}, {}, &graph);
+ EXPECT_NE(FindGraphNodeWithName("A", *graph.GetGraph()), -1);
- TF_EXPECT_OK(DeleteNodes({"A"}, &graph));
- EXPECT_EQ(FindNodeWithOp("OpA", graph), -1);
+ graph.DeleteNodes({"A"});
+ EXPECT_EQ(FindGraphNodeWithName("A", *graph.GetGraph()), -1);
}
-TEST(GraphUtilsTest, SetUniqueName) {
- GraphDef graph;
+TEST(GraphUtilsTest, FindFunctionNodeWithName) {
+ FunctionDef function = test::function::XTimesTwo();
+ EXPECT_EQ(
+ FindFunctionNodeWithName("weird_name_that_should_not_be_there", function),
+ -1);
+ EXPECT_NE(FindFunctionNodeWithName("two", function), -1);
+}
- NodeDef* node1;
- TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node1));
- NodeDef* node2;
- TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node2));
- EXPECT_NE(node1->name(), node2->name());
+TEST(GraphUtilsTest, FindGraphFunctionWithName) {
+ FunctionDefLibrary library;
+ EXPECT_EQ(FindGraphFunctionWithName("new_function", library), -1);
+ FunctionDef* new_function = library.add_function();
+ SetUniqueGraphFunctionName("new_function", &library, new_function);
- TF_EXPECT_OK(DeleteNodes({node1->name()}, &graph));
- NodeDef* node3;
- TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node3));
- EXPECT_NE(node2->name(), node3->name());
+ EXPECT_NE(
+ FindGraphFunctionWithName(new_function->signature().name(), library), -1);
}
-TEST(GraphUtilsTest, ReplaceInput) {
- GraphDef graph;
+TEST(GraphUtilsTest, FindNodeWithOp) {
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
+ EXPECT_EQ(FindNodeWithOp("OpA", *graph.GetGraph()), -1);
+
+ AddNode("A", "OpA", {}, {}, &graph);
+ EXPECT_NE(FindNodeWithOp("OpA", *graph.GetGraph()), -1);
+
+ graph.DeleteNodes({"A"});
+ EXPECT_EQ(FindNodeWithOp("OpA", *graph.GetGraph()), -1);
+}
- NodeDef* node1;
- TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node1));
+TEST(GraphUtilsTest, SetUniqueGraphNodeName) {
+ GraphDef graph_def;
+ MutableGraphView graph(&graph_def);
- NodeDef* node2;
- TF_EXPECT_OK(AddNode("", "A", {node1->name()}, {}, &graph, &node2));
+ NodeDef* node1 = AddNode("", "A", {}, {}, &graph);
+ NodeDef* node2 = AddNode("", "A", {}, {}, &graph);
+ EXPECT_NE(node1->name(), node2->name());
- NodeDef* node3;
- TF_EXPECT_OK(AddNode("", "A", {node2->name()}, {}, &graph, &node3));
+ graph.DeleteNodes({node1->name()});
+ NodeDef* node3 = AddNode("", "A", {}, {}, &graph);
+ EXPECT_NE(node2->name(), node3->name());
+}
- EXPECT_EQ(node3->input(0), node2->name());
+TEST(GraphUtilsTest, SetUniqueFunctionNodeName) {
+ FunctionDef function = test::function::XTimesTwo();
+ NodeDef node;
+ SetUniqueFunctionNodeName("abc", &function, &node);
+ for (const NodeDef& function_node : function.node_def()) {
+ EXPECT_NE(node.name(), function_node.name());
+ }
+ auto* new_node = function.add_node_def();
+ *new_node = node;
+
+ NodeDef other;
+ SetUniqueFunctionNodeName("abc", &function, &other);
+ EXPECT_NE(other.name(), new_node->name());
+}
- GraphView view(&graph);
- ReplaceInput(*node2, *node1, &view);
+TEST(GraphUtilsTest, SetUniqueGraphFunctionName) {
+ FunctionDefLibrary library;
+ FunctionDef* new_function = library.add_function();
+ SetUniqueGraphFunctionName("new_function", &library, new_function);
- EXPECT_EQ(node3->input(0), node1->name());
+ FunctionDef* other_function = library.add_function();
+ SetUniqueGraphFunctionName("new_function", &library, other_function);
+ EXPECT_NE(new_function->signature().name(),
+ other_function->signature().name());
}
} // namespace
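
A detail these tests lean on: AddNode returns a raw NodeDef* that remains valid across later insertions. Protobuf repeated message fields heap-allocate each element, so such pointers are stable until the element is deleted, much like elements of a std::deque. A sketch of that ownership shape, with hypothetical names:

    #include <deque>
    #include <string>
    #include <utility>

    struct Node {
      std::string name;
    };

    class SimpleGraphView {
     public:
      // The returned pointer stays valid across later AddNode calls because
      // std::deque never relocates existing elements on push_back.
      Node* AddNode(Node node) {
        nodes_.push_back(std::move(node));
        return &nodes_.back();
      }

     private:
      std::deque<Node> nodes_;
    };
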
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index eac665bd92..3ce238a30a 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -18,8 +18,8 @@ limitations under the License.
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/grappler/clusters/cluster.h"
-#include "tensorflow/core/grappler/graph_view.h"
#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
@@ -32,12 +32,70 @@ namespace {
constexpr char kFusedOpName[] = "MapAndBatchDatasetV2";
+NodeDef MakeMapAndBatchNode(const NodeDef& map_node,
+                            const NodeDef& batch_node,
+                            MutableGraphView* graph) {
+ NodeDef new_node;
+ new_node.set_op(kFusedOpName);
+ graph_utils::SetUniqueGraphNodeName(kFusedOpName, graph->GetGraph(),
+ &new_node);
+
+ // Set the `input` input argument.
+ new_node.add_input(map_node.input(0));
+
+ // Set the `other_arguments` input arguments.
+ int num_other_args;
+ if (map_node.op() == "ParallelMapDataset") {
+ num_other_args = map_node.input_size() - 2;
+ } else {
+ num_other_args = map_node.input_size() - 1;
+ }
+ for (int i = 0; i < num_other_args; i++) {
+ new_node.add_input(map_node.input(i + 1));
+ }
+
+ // Set the `batch_size` input argument.
+ new_node.add_input(batch_node.input(1));
+
+ // Set the `num_parallel_calls` input argument.
+ if (map_node.op() == "ParallelMapDataset") {
+ // The type of the `num_parallel_calls` argument in ParallelMapDataset
+ // and MapAndBatchDataset is different (int32 and int64 respectively)
+ // so we cannot reuse the same Const node and thus create a new one.
+ NodeDef* v = graph->GetNode(map_node.input(map_node.input_size() - 1));
+ NodeDef* tmp = graph_utils::AddScalarConstNode<int64>(
+ v->attr().at("value").tensor().int_val(0), graph);
+ new_node.add_input(tmp->name());
+ } else {
+ NodeDef* tmp = graph_utils::AddScalarConstNode<int64>(1, graph);
+ new_node.add_input(tmp->name());
+ }
+
+ // Set the `drop_remainder` input argument.
+ if (batch_node.op() == "BatchDatasetV2") {
+ new_node.add_input(batch_node.input(2));
+ } else {
+ NodeDef* tmp = graph_utils::AddScalarConstNode<bool>(false, graph);
+ new_node.add_input(tmp->name());
+ }
+
+ // Set `f` and `Targuments` attributes.
+ for (auto key : {"f", "Targuments"}) {
+ (*new_node.mutable_attr())[key] = map_node.attr().at(key);
+ }
+ // Set `output_types` and `output_shapes` attributes.
+ for (auto key : {"output_shapes", "output_types"}) {
+ (*new_node.mutable_attr())[key] = batch_node.attr().at(key);
+ }
+ return new_node;
+}
+
} // namespace
Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
GraphDef* output) {
*output = item.graph;
- GraphView graph(output);
+ MutableGraphView graph(output);
std::set<string> nodes_to_delete;
for (const NodeDef& node : item.graph.node()) {
if (node.op() != "BatchDataset" && node.op() != "BatchDatasetV2") {
@@ -45,79 +103,25 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
}
// Use a more descriptive variable name now that we know the node type.
- const NodeDef batch_node(node);
+ const NodeDef& batch_node = node;
GraphView::InputPort input_port = graph.GetInputPort(batch_node.name(), 0);
NodeDef* node2 = graph.GetRegularFanin(input_port).node;
if (node2->op() != "MapDataset" && node2->op() != "ParallelMapDataset") {
continue;
}
-
- NodeDef* new_node = output->add_node();
- new_node->set_op(kFusedOpName);
- graph_utils::SetUniqueName(kFusedOpName, output, new_node);
-
// Use a more descriptive variable name now that we know the node type.
NodeDef* map_node = node2;
- // Set the `input` input argument.
- new_node->add_input(map_node->input(0));
-
- // Set the `other_arguments` input arguments.
- int num_other_args;
- if (map_node->op() == "ParallelMapDataset") {
- num_other_args = map_node->input_size() - 2;
- } else {
- num_other_args = map_node->input_size() - 1;
- }
- for (int i = 0; i < num_other_args; i++) {
- new_node->add_input(map_node->input(i + 1));
- }
- // Set the `batch_size` input argument.
- new_node->add_input(batch_node.input(1));
-
- // Set the `num_parallel_calls` input argument.
- if (map_node->op() == "ParallelMapDataset") {
- // The type of the `num_parallel_calls` argument in ParallelMapDataset
- // and MapAndBatchDataset is different (int32 and int64 respectively)
- // so we cannot reuse the same Const node and thus create a new one.
- NodeDef* v = graph.GetNode(map_node->input(map_node->input_size() - 1));
- NodeDef* tmp;
- TF_RETURN_IF_ERROR(graph_utils::AddScalarConstNode<int64>(
- v->attr().at("value").tensor().int_val(0), output, &tmp));
- new_node->add_input(tmp->name());
- } else {
- NodeDef* tmp;
- TF_RETURN_IF_ERROR(
- graph_utils::AddScalarConstNode<int64>(1, output, &tmp));
- new_node->add_input(tmp->name());
- }
-
- // Set the `drop_remainder` input argument.
- if (batch_node.op() == "BatchDatasetV2") {
- new_node->add_input(batch_node.input(2));
- } else {
- NodeDef* tmp;
- TF_RETURN_IF_ERROR(
- graph_utils::AddScalarConstNode<bool>(false, output, &tmp));
- new_node->add_input(tmp->name());
- }
-
- // Set `f` and `Targuments` attributes.
- for (auto key : {"f", "Targuments"}) {
- (*new_node->mutable_attr())[key] = map_node->attr().at(key);
- }
- // Set `output_types` and `output_shapes` attributes.
- for (auto key : {"output_shapes", "output_types"}) {
- (*new_node->mutable_attr())[key] = batch_node.attr().at(key);
- }
+ auto* new_node =
+        graph.AddNode(MakeMapAndBatchNode(*map_node, batch_node, &graph));
+ graph.ReplaceInput(batch_node, *new_node);
// Mark the `Map` and `Batch` nodes for removal.
nodes_to_delete.insert(map_node->name());
nodes_to_delete.insert(batch_node.name());
-
- graph_utils::ReplaceInput(batch_node, *new_node, &graph);
}
- TF_RETURN_IF_ERROR(graph_utils::DeleteNodes(nodes_to_delete, output));
+
+ graph.DeleteNodes(nodes_to_delete);
return Status::OK();
}
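
The pivotal call in the rewritten loop is graph.ReplaceInput(batch_node, *new_node): every consumer that previously read from the Batch node is redirected to the fused node before the old Map/Batch pair is deleted. The rewiring semantics over a toy graph representation (the real MutableGraphView tracks fanouts instead of scanning every node, but the effect is the same):

    #include <string>
    #include <unordered_map>
    #include <vector>

    // Maps a node name to the list of its input names.
    using Graph = std::unordered_map<std::string, std::vector<std::string>>;

    void ReplaceInput(const std::string& old_name, const std::string& new_name,
                      Graph* graph) {
      for (auto& node_and_inputs : *graph) {
        for (std::string& input : node_and_inputs.second) {
          if (input == old_name) input = new_name;
        }
      }
    }
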
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
index 3c1d8d5359..a46c504ac4 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
@@ -27,25 +27,21 @@ namespace {
TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
GrapplerItem item;
- GraphDef *graph = &item.graph;
- NodeDef *start_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
- NodeDef *stop_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
- NodeDef *step_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+ MutableGraphView graph(&item.graph);
+
+ NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+ NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+ NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
std::vector<string> range_inputs(3);
range_inputs[0] = start_node->name();
range_inputs[1] = stop_node->name();
range_inputs[2] = step_node->name();
std::vector<std::pair<string, AttrValue>> range_attrs;
- NodeDef *range_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
- range_attrs, graph, &range_node));
- NodeDef *captured_input_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<StringPiece>(
- "hello", graph, &captured_input_node));
+ NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+ range_attrs, &graph);
+ NodeDef *captured_input_node =
+ graph_utils::AddScalarConstNode<StringPiece>("hello", &graph);
NodeDef *map_node;
{
@@ -59,13 +55,11 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
AttrValue args_attr;
SetAttrValue("Targuments", &args_attr);
map_attrs[1] = std::make_pair("Targuments", args_attr);
- TF_ASSERT_OK(graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs,
- graph, &map_node));
+ map_node =
+ graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs, &graph);
}
- NodeDef *batch_size_node;
- TF_ASSERT_OK(
- graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
+ NodeDef *batch_size_node = graph_utils::AddScalarConstNode<int64>(5, &graph);
NodeDef *batch_node;
{
std::vector<string> batch_inputs(2);
@@ -78,16 +72,18 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
AttrValue types_attr;
SetAttrValue("output_types", &types_attr);
batch_attrs[1] = std::make_pair("output_types", types_attr);
- TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
- batch_attrs, graph, &batch_node));
+ batch_node = graph_utils::AddNode("", "BatchDataset", batch_inputs,
+ batch_attrs, &graph);
}
MapAndBatchFusion optimizer;
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
- EXPECT_FALSE(graph_utils::ContainsNodeWithName(map_node->name(), output));
- EXPECT_FALSE(graph_utils::ContainsNodeWithName(batch_node->name(), output));
+ EXPECT_FALSE(
+ graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
+ EXPECT_FALSE(
+ graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
NodeDef map_and_batch_node =
output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
@@ -96,11 +92,11 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
EXPECT_EQ(map_and_batch_node.input(2), batch_node->input(1));
NodeDef num_parallel_calls_node = output.node(
- graph_utils::FindNodeWithName(map_and_batch_node.input(3), output));
+ graph_utils::FindGraphNodeWithName(map_and_batch_node.input(3), output));
EXPECT_EQ(num_parallel_calls_node.attr().at("value").tensor().int64_val(0),
1);
NodeDef drop_remainder_node = output.node(
- graph_utils::FindNodeWithName(map_and_batch_node.input(4), output));
+ graph_utils::FindGraphNodeWithName(map_and_batch_node.input(4), output));
EXPECT_EQ(drop_remainder_node.attr().at("value").tensor().bool_val(0), false);
EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("f"),
map_node->attr().at("f")));
@@ -114,25 +110,20 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
GrapplerItem item;
- GraphDef *graph = &item.graph;
- NodeDef *start_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
- NodeDef *stop_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
- NodeDef *step_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+ MutableGraphView graph(&item.graph);
+ NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+ NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+ NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
std::vector<string> range_inputs(3);
range_inputs[0] = start_node->name();
range_inputs[1] = stop_node->name();
range_inputs[2] = step_node->name();
std::vector<std::pair<string, AttrValue>> range_attrs;
- NodeDef *range_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
- range_attrs, graph, &range_node));
- NodeDef *captured_input_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<StringPiece>(
- "hello", graph, &captured_input_node));
+ NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+ range_attrs, &graph);
+ NodeDef *captured_input_node =
+ graph_utils::AddScalarConstNode<StringPiece>("hello", &graph);
NodeDef *map_node;
{
@@ -146,16 +137,13 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
AttrValue args_attr;
SetAttrValue("Targuments", &args_attr);
map_attrs[1] = std::make_pair("Targuments", args_attr);
- TF_ASSERT_OK(graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs,
- graph, &map_node));
+ map_node =
+ graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs, &graph);
}
- NodeDef *batch_size_node;
- TF_ASSERT_OK(
- graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
- NodeDef *drop_remainder_node;
- TF_ASSERT_OK(
- graph_utils::AddScalarConstNode<bool>(true, graph, &drop_remainder_node));
+ NodeDef *batch_size_node = graph_utils::AddScalarConstNode<int64>(5, &graph);
+ NodeDef *drop_remainder_node =
+ graph_utils::AddScalarConstNode<bool>(true, &graph);
NodeDef *batch_node;
{
std::vector<string> batch_inputs(3);
@@ -169,16 +157,18 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
AttrValue types_attr;
SetAttrValue("output_types", &types_attr);
batch_attrs[1] = std::make_pair("output_types", types_attr);
- TF_ASSERT_OK(graph_utils::AddNode("", "BatchDatasetV2", batch_inputs,
- batch_attrs, graph, &batch_node));
+ batch_node = graph_utils::AddNode("", "BatchDatasetV2", batch_inputs,
+ batch_attrs, &graph);
}
MapAndBatchFusion optimizer;
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
- EXPECT_FALSE(graph_utils::ContainsNodeWithName(map_node->name(), output));
- EXPECT_FALSE(graph_utils::ContainsNodeWithName(batch_node->name(), output));
+ EXPECT_FALSE(
+ graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
+ EXPECT_FALSE(
+ graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
NodeDef map_and_batch_node =
output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
@@ -187,7 +177,7 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
EXPECT_EQ(map_and_batch_node.input(2), batch_node->input(1));
NodeDef num_parallel_calls_node = output.node(
- graph_utils::FindNodeWithName(map_and_batch_node.input(3), output));
+ graph_utils::FindGraphNodeWithName(map_and_batch_node.input(3), output));
EXPECT_EQ(num_parallel_calls_node.attr().at("value").tensor().int64_val(0),
1);
EXPECT_EQ(map_and_batch_node.input(4), batch_node->input(2));
@@ -203,28 +193,22 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
GrapplerItem item;
- GraphDef *graph = &item.graph;
- NodeDef *start_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
- NodeDef *stop_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
- NodeDef *step_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+ MutableGraphView graph(&item.graph);
+ NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+ NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+ NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
std::vector<string> range_inputs(3);
range_inputs[0] = start_node->name();
range_inputs[1] = stop_node->name();
range_inputs[2] = step_node->name();
std::vector<std::pair<string, AttrValue>> range_attrs;
- NodeDef *range_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
- range_attrs, graph, &range_node));
- NodeDef *captured_input_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<StringPiece>(
- "hello", graph, &captured_input_node));
- NodeDef *num_parallel_calls_node;
- TF_ASSERT_OK(
- graph_utils::AddScalarConstNode<int>(2, graph, &num_parallel_calls_node));
+ NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+ range_attrs, &graph);
+ NodeDef *captured_input_node =
+ graph_utils::AddScalarConstNode<StringPiece>("hello", &graph);
+ NodeDef *num_parallel_calls_node =
+ graph_utils::AddScalarConstNode<int>(2, &graph);
NodeDef *map_node;
{
@@ -239,13 +223,11 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
AttrValue args_attr;
SetAttrValue("Targuments", &args_attr);
map_attrs[1] = std::make_pair("Targuments", args_attr);
- TF_ASSERT_OK(graph_utils::AddNode("", "ParallelMapDataset", map_inputs,
- map_attrs, graph, &map_node));
+ map_node = graph_utils::AddNode("", "ParallelMapDataset", map_inputs,
+ map_attrs, &graph);
}
- NodeDef *batch_size_node;
- TF_ASSERT_OK(
- graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
+ NodeDef *batch_size_node = graph_utils::AddScalarConstNode<int64>(5, &graph);
NodeDef *batch_node;
{
std::vector<string> batch_inputs(2);
@@ -258,16 +240,18 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
AttrValue types_attr;
SetAttrValue("output_types", &types_attr);
batch_attrs[1] = std::make_pair("output_types", types_attr);
- TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
- batch_attrs, graph, &batch_node));
+ batch_node = graph_utils::AddNode("", "BatchDataset", batch_inputs,
+ batch_attrs, &graph);
}
MapAndBatchFusion optimizer;
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
- EXPECT_FALSE(graph_utils::ContainsNodeWithName(map_node->name(), output));
- EXPECT_FALSE(graph_utils::ContainsNodeWithName(batch_node->name(), output));
+ EXPECT_FALSE(
+ graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
+ EXPECT_FALSE(
+ graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
NodeDef map_and_batch_node =
output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
@@ -276,11 +260,11 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
EXPECT_EQ(map_and_batch_node.input(2), batch_node->input(1));
NodeDef num_parallel_calls_node2 = output.node(
- graph_utils::FindNodeWithName(map_and_batch_node.input(3), output));
+ graph_utils::FindGraphNodeWithName(map_and_batch_node.input(3), output));
EXPECT_EQ(num_parallel_calls_node2.attr().at("value").tensor().int64_val(0),
2);
NodeDef drop_remainder_node = output.node(
- graph_utils::FindNodeWithName(map_and_batch_node.input(4), output));
+ graph_utils::FindGraphNodeWithName(map_and_batch_node.input(4), output));
EXPECT_EQ(drop_remainder_node.attr().at("value").tensor().bool_val(0), false);
EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("f"),
map_node->attr().at("f")));
@@ -294,27 +278,21 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
TEST(MapAndBatchFusionTest, NoChange) {
GrapplerItem item;
- GraphDef *graph = &item.graph;
+ MutableGraphView graph(&item.graph);
- NodeDef *start_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
- NodeDef *stop_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
- NodeDef *step_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+ NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+ NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+ NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
std::vector<string> range_inputs(3);
range_inputs[0] = start_node->name();
range_inputs[1] = stop_node->name();
range_inputs[2] = step_node->name();
std::vector<std::pair<string, AttrValue>> range_attrs;
- NodeDef *range_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
- range_attrs, graph, &range_node));
+ NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+ range_attrs, &graph);
- NodeDef *batch_size_node;
- TF_ASSERT_OK(
- graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
+ NodeDef *batch_size_node = graph_utils::AddScalarConstNode<int64>(5, &graph);
std::vector<string> batch_inputs(2);
batch_inputs[0] = range_node->name();
batch_inputs[1] = batch_size_node->name();
@@ -325,15 +303,13 @@ TEST(MapAndBatchFusionTest, NoChange) {
AttrValue types_attr;
SetAttrValue("output_types", &types_attr);
batch_attrs[1] = std::make_pair("output_types", types_attr);
- NodeDef *batch_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
- batch_attrs, graph, &batch_node));
+ graph_utils::AddNode("", "BatchDataset", batch_inputs, batch_attrs, &graph);
MapAndBatchFusion optimizer;
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
- EXPECT_TRUE(graph_utils::Compare(*graph, output));
+ EXPECT_TRUE(graph_utils::Compare(*graph.GetGraph(), output));
}
} // namespace
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
new file mode 100644
index 0000000000..707f4a3407
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
@@ -0,0 +1,261 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/map_fusion.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+// Sets basic function parameters and copies attributes from the parent and
+// map nodes.
+NodeDef MakeFusedNode(const NodeDef& parent_map_node, const NodeDef& map_node,
+ const FunctionDef& fused_function,
+ MutableGraphView* graph) {
+ NodeDef fused_node;
+ graph_utils::SetUniqueGraphNodeName("fused_map", graph->GetGraph(),
+ &fused_node);
+
+ fused_node.set_op("MapDataset");
+ fused_node.add_input(parent_map_node.input(0));
+
+ auto copy_attribute = [](const string& attribute_name, const NodeDef& from,
+ NodeDef* to) {
+ (*to->mutable_attr())[attribute_name] = from.attr().at(attribute_name);
+ };
+
+ auto attr = parent_map_node.attr().at("f");
+ *attr.mutable_func()->mutable_name() = fused_function.signature().name();
+ (*fused_node.mutable_attr())["f"] = std::move(attr);
+
+ copy_attribute("Targuments", parent_map_node, &fused_node);
+
+ for (auto key : {"output_shapes", "output_types"})
+ copy_attribute(key, map_node, &fused_node);
+
+ return fused_node;
+}
+
+string ParseNodeConnection(const string& name) {
+  // If the input/output node name contains a colon, take the prefix before
+  // it. Otherwise take the whole string.
+ return name.substr(0, name.find(':'));
+}
+
+string ParseOutputNode(const string& name) {
+  // Names without an output suffix would make substr(npos) throw, so guard.
+  const size_t pos = name.find(':');
+  return pos == string::npos ? string() : name.substr(pos);
+}
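
Tensor names inside function graphs follow the "node:output:index" convention, so the two parsers split such a string into a node part and an output suffix that can be reattached after a rename:

    #include <cassert>
    #include <string>

    int main() {
      const std::string input = "map/_1:output:0";
      const std::string node = input.substr(0, input.find(':'));  // "map/_1"
      const std::string suffix = input.substr(input.find(':'));   // ":output:0"
      assert(node + suffix == input);
      return 0;
    }
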
+
+const string& GetOutputNode(const FunctionDef& parent_function,
+ int output_idx) {
+ const auto& ret_output_name =
+ parent_function.signature().output_arg(output_idx).name();
+ return parent_function.ret().at(ret_output_name);
+}
+
+// Nodes added to the fused function can collide with node names from the
+// parent function. We rename them and patch every input connection that
+// refers to a renamed node.
+void RenameFunctionNodes(FunctionDef* fused_function,
+ protobuf::RepeatedPtrField<NodeDef>* nodes_to_fuse) {
+ std::unordered_map<string, string> changed_node_names;
+ for (NodeDef& function_node : *nodes_to_fuse) {
+ string name_before = function_node.name();
+ graph_utils::SetUniqueFunctionNodeName(name_before, fused_function,
+ &function_node);
+ if (name_before != function_node.name())
+ changed_node_names[name_before] = function_node.name();
+ }
+
+ auto update_name = [&changed_node_names](string* input) {
+ string input_node = ParseNodeConnection(*input);
+ if (changed_node_names.count(input_node) == 0) return;
+ const string& new_node_name = changed_node_names.at(input_node);
+ *input = new_node_name + ParseOutputNode(*input);
+ };
+
+ for (NodeDef& function_node : *nodes_to_fuse) {
+ for (string& input : *function_node.mutable_input()) {
+ update_name(&input);
+ }
+ }
+
+ for (auto& ret : *fused_function->mutable_ret()) update_name(&ret.second);
+}
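
RenameFunctionNodes is a two-phase pass: first record every rename, then patch all references in a single sweep so no edge is left pointing at a stale name. The fix-up phase in isolation (the real code additionally splits off the ":output:index" suffix before the lookup):

    #include <string>
    #include <unordered_map>
    #include <vector>

    void ApplyRenames(const std::unordered_map<std::string, std::string>& renamed,
                      std::vector<std::string>* inputs) {
      for (std::string& input : *inputs) {
        auto it = renamed.find(input);
        if (it != renamed.end()) input = it->second;
      }
    }
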
+
+// Rewires the fused nodes so that inputs referring to the child function's
+// input arguments read from the corresponding output nodes of the parent.
+void FuseFunctionNodes(const FunctionDef& parent_function,
+ const FunctionDef& function,
+ protobuf::RepeatedPtrField<NodeDef>* nodes_to_fuse) {
+ const auto number_of_outputs = parent_function.signature().output_arg_size();
+ CHECK(number_of_outputs == function.signature().input_arg_size())
+ << "The number of input arguments of function "
+ << function.signature().name()
+ << " should be the same as the number of output arguments of function "
+ << parent_function.signature().name() << ".";
+
+ for (int output_idx = 0; output_idx < number_of_outputs; output_idx++) {
+ const string& output = GetOutputNode(parent_function, output_idx);
+
+ const auto& input_node_name =
+ function.signature().input_arg(output_idx).name();
+
+ for (NodeDef& function_node : *nodes_to_fuse) {
+ for (auto& node_input : *function_node.mutable_input()) {
+ auto parsed_name = ParseNodeConnection(node_input);
+ if (parsed_name != input_node_name) continue;
+
+ node_input = output;
+ }
+ }
+ }
+}
+
+// Looks for direct edges from an input argument to a return value and
+// rewrites them to the corresponding output of `parent_function`.
+void FuseReturns(const FunctionDef& parent_function,
+ const FunctionDef& function, FunctionDef* fused_function) {
+ const auto number_of_inputs = function.signature().input_arg_size();
+
+ for (auto& ret : *fused_function->mutable_ret()) {
+ auto return_input = ParseNodeConnection(ret.second);
+ for (int input_idx = 0; input_idx < number_of_inputs; input_idx++) {
+ const auto& input_arg = function.signature().input_arg(input_idx);
+ if (return_input != input_arg.name()) continue;
+
+ ret.second = GetOutputNode(parent_function, input_idx);
+ }
+ }
+}
+
+// Produces a new function that is the result of fusing `parent_function`
+// with `function`.
+FunctionDef* FuseFunctions(const FunctionDef& parent_function,
+ const FunctionDef& function,
+ FunctionDefLibrary* library) {
+ FunctionDef* fused_function = library->add_function();
+ graph_utils::SetUniqueGraphFunctionName("fused_function", library,
+ fused_function);
+
+ // Copy input signature from parent function.
+ *fused_function->mutable_signature()->mutable_input_arg() =
+ parent_function.signature().input_arg();
+
+ fused_function->mutable_node_def()->CopyFrom(parent_function.node_def());
+  // This code assumes the functions do not have any attributes. If that is
+  // not the case, we would need to merge attributes and fix name conflicts.
+ CHECK(parent_function.attr_size() == 0 && function.attr_size() == 0 &&
+ "Functions with attributes are currently not supported");
+
+ // Copy the returns and output signature from the second node.
+ auto nodes_to_fuse = function.node_def();
+ fused_function->mutable_signature()->mutable_output_arg()->CopyFrom(
+ function.signature().output_arg());
+ *fused_function->mutable_ret() = function.ret();
+
+ RenameFunctionNodes(fused_function, &nodes_to_fuse);
+ FuseFunctionNodes(parent_function, function, &nodes_to_fuse);
+ FuseReturns(parent_function, function, fused_function);
+
+ // Copy transformed nodes from the second function.
+ fused_function->mutable_node_def()->MergeFrom(nodes_to_fuse);
+
+ return fused_function;
+}
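
Stripped of the proto plumbing, the contract of FuseFunctions is ordinary composition: if the parent map computes f and the child computes g, the fused function computes g(f(x)), taking the parent's input signature and the child's output signature. In plain C++:

    #include <functional>
    #include <vector>

    using Fn = std::function<std::vector<double>(const std::vector<double>&)>;

    // fused(x) == g(f(x)), mirroring the rewiring of f's outputs to g's inputs.
    Fn Fuse(Fn f, Fn g) {
      return [f, g](const std::vector<double>& x) { return g(f(x)); };
    }
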
+
+} // namespace
+
+Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
+ GraphDef* output) {
+ GraphDef sorted_old_graph = item.graph;
+ TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph));
+ *output = sorted_old_graph;
+
+ MutableGraphView graph(output);
+ std::set<string> nodes_to_delete;
+ FunctionLibraryDefinition function_library(OpRegistry::Global(),
+ item.graph.library());
+
+ auto get_map_node = [](const NodeDef& node) -> const NodeDef* {
+ // TODO(prazek): we could also handle ParallelMapDataset and
+ // MapAndBatchDataset.
+ if (node.op() == "MapDataset") return &node;
+ return nullptr;
+ };
+
+ auto get_fused_function = [&function_library, &output](
+ const NodeDef* parent_map_node,
+ const NodeDef* map_node) {
+ const auto& parent_fun = parent_map_node->attr().at("f");
+ const FunctionDef* parent_func =
+ function_library.Find(parent_fun.func().name());
+ const auto& fun = map_node->attr().at("f");
+ const FunctionDef* func = function_library.Find(fun.func().name());
+
+ return FuseFunctions(*parent_func, *func, output->mutable_library());
+ };
+
+ for (const NodeDef& node : sorted_old_graph.node()) {
+ const NodeDef* map_node = get_map_node(node);
+ if (!map_node) continue;
+
+ GraphView::InputPort input_port = graph.GetInputPort(map_node->name(), 0);
+ const NodeDef* parent_map_node =
+ get_map_node(*graph.GetRegularFanin(input_port).node);
+ if (!parent_map_node) continue;
+
+ const auto* fused_function = get_fused_function(parent_map_node, map_node);
+ const auto* fused_maps_node = graph.AddNode(
+ MakeFusedNode(*parent_map_node, *map_node, *fused_function, &graph));
+
+ graph.ReplaceInput(*map_node, *fused_maps_node);
+
+ // TODO(prazek): we should run some optimizations on the fused map
+ // functions, or make sure that optimization passes run after map
+ // fusion.
+ TF_RETURN_IF_ERROR(function_library.AddFunctionDef(*fused_function));
+
+ // TODO(prazek): we could also remove map functions from library if they
+ // are not used anymore.
+ nodes_to_delete.insert(parent_map_node->name());
+ nodes_to_delete.insert(map_node->name());
+ }
+
+ graph.DeleteNodes(nodes_to_delete);
+ return Status::OK();
+}
+
+void MapFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
+ const GraphDef& optimize_output, double result) {
+ // no-op
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(MapFusion, "map_fusion");
+
+} // end namespace grappler
+} // end namespace tensorflow
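At a glance, FuseFunctions splices `function`'s nodes onto `parent_function`'s outputs: any reference a fused node makes to an input argument of `function` is rebound to the corresponding output of `parent_function`. A minimal standalone sketch of that rebinding idea (the toy `Node` struct and all names here are illustrative, not the TF types):

#include <map>
#include <string>
#include <vector>

// Toy model of a function body; names and types are illustrative only.
struct Node {
  std::string name;
  std::vector<std::string> inputs;
};

// Rebinds references to the inner function's input arguments so that they
// point at the outer function's outputs instead, mirroring what
// FuseFunctionNodes/FuseReturns do via ParseNodeConnection/GetOutputNode.
void Rebind(std::vector<Node>* inner_nodes,
            const std::map<std::string, std::string>& arg_to_output) {
  for (Node& node : *inner_nodes) {
    for (std::string& input : node.inputs) {
      auto it = arg_to_output.find(input);
      if (it != arg_to_output.end()) input = it->second;
    }
  }
}

int main() {
  std::vector<Node> inner = {{"double", {"x"}}};  // inner computes double(x)
  Rebind(&inner, {{"x", "parent_out"}});  // now reads the parent's output
  return inner[0].inputs[0] == "parent_out" ? 0 : 1;
}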
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.h b/tensorflow/core/grappler/optimizers/data/map_fusion.h
new file mode 100644
index 0000000000..a6a06592b8
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// This optimization fuses map transformations by merging their map functions.
+class MapFusion : public CustomGraphOptimizer {
+ public:
+ MapFusion() = default;
+ ~MapFusion() override = default;
+
+ string name() const override { return "map_fusion"; }
+
+ Status Init(
+ const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+ return Status::OK();
+ }
+
+ Status Optimize(Cluster* cluster, const GrapplerItem& item,
+ GraphDef* output) override;
+
+ void Feedback(Cluster* cluster, const GrapplerItem& item,
+ const GraphDef& optimize_output, double result) override;
+};
+
+} // end namespace grappler
+} // end namespace tensorflow
+
+#endif // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc
new file mode 100644
index 0000000000..df6c19dc7c
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/map_fusion.h"
+
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name) {
+ return test::function::NDef(
+ name, "MapDataset", {input_node_name.ToString()},
+ {{"f", FunctionDefHelper::FunctionRef("XTimesTwo")},
+ {"Targuments", {}},
+ {"output_shapes", {}},
+ {"output_types", {}}});
+}
+
+TEST(MapFusionTest, FuseTwoMapNodesIntoOne) {
+ using test::function::NDef;
+ GrapplerItem item;
+ item.graph = test::function::GDef(
+ {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+ NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+ NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+ NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+ MakeMapNode("map1", "range"), MakeMapNode("map2", "map1")},
+ // FunctionLib
+ {
+ test::function::XTimesTwo(),
+ });
+
+ MapFusion optimizer;
+ GraphDef output;
+ TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+ EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapDataset", output));
+ EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map1", output));
+ EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map2", output));
+}
+
+TEST(MapFusionTest, FuseThreeNodesIntoOne) {
+ using test::function::NDef;
+ GrapplerItem item;
+ item.graph = test::function::GDef(
+ {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+ NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+ NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+ NDef("filename", "Const", {}, {{"value", ""}, {"dtype", DT_STRING}}),
+ NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+ MakeMapNode("map1", "range"), MakeMapNode("map2", "map1"),
+ MakeMapNode("map3", "map2"),
+ NDef("cache", "CacheDataset", {"map3", "filename"}, {})},
+ // FunctionLib
+ {
+ test::function::XTimesTwo(),
+ });
+
+ MapFusion optimizer;
+ GraphDef output;
+ TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+ EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapDataset", output));
+ EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map1", output));
+ EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map2", output));
+ EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map3", output));
+}
+
+} // namespace
+} // namespace grappler
+} // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
index 5670966367..55d57b3b97 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
@@ -18,8 +18,8 @@ limitations under the License.
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/grappler/clusters/cluster.h"
-#include "tensorflow/core/grappler/graph_view.h"
#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
@@ -64,18 +64,19 @@ bool IsNoOp(const NodeDef& node, const GraphView& graph) {
Status NoOpElimination::Optimize(Cluster* cluster, const GrapplerItem& item,
GraphDef* output) {
*output = item.graph;
- GraphView graph(output);
+ MutableGraphView graph(output);
std::set<string> nodes_to_delete;
for (const NodeDef& node : item.graph.node()) {
if (!IsNoOp(node, graph)) continue;
GraphView::InputPort input_port = graph.GetInputPort(node.name(), 0);
NodeDef* const parent = graph.GetRegularFanin(input_port).node;
- graph_utils::ReplaceInput(node, *parent, &graph);
+ graph.ReplaceInput(node, *parent);
nodes_to_delete.insert(node.name());
}
- TF_RETURN_IF_ERROR(graph_utils::DeleteNodes(nodes_to_delete, output));
+
+ graph.DeleteNodes(nodes_to_delete);
return Status::OK();
}
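The rewrite above is the classic bypass-and-delete pattern: every consumer of the no-op node is rewired to the no-op's own input, and the node is then dropped. A standalone sketch with a toy one-input-per-node graph (hypothetical types, not MutableGraphView):

#include <map>
#include <string>

// Toy graph: each node has exactly one data input, keyed by node name.
using Graph = std::map<std::string, std::string>;

// Bypass `node`: rewire its consumers to its parent, then erase it.
void BypassAndDelete(Graph* graph, const std::string& node) {
  const std::string parent = (*graph)[node];
  for (auto& entry : *graph) {
    if (entry.second == node) entry.second = parent;  // ReplaceInput
  }
  graph->erase(node);  // DeleteNodes
}

int main() {
  Graph g = {{"range", ""}, {"take_all", "range"}, {"cache", "take_all"}};
  BypassAndDelete(&g, "take_all");
  return g.at("cache") == "range" ? 0 : 1;
}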
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc
index 8628b16ea5..a6cc63edba 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc
@@ -35,36 +35,32 @@ std::vector<std::pair<string, AttrValue>> GetCommonAttributes() {
return commonAttributes;
}
-void MakeUnaryNode(GraphDef *graph, const std::string &node_type, int count,
- string input_node, NodeDef **return_node) {
- NodeDef *node_count;
- TF_ASSERT_OK(
- graph_utils::AddScalarConstNode<int64>(count, graph, &node_count));
- TF_ASSERT_OK(graph_utils::AddNode("", node_type,
- {std::move(input_node), node_count->name()},
- GetCommonAttributes(), graph, return_node));
+NodeDef *MakeUnaryNode(const std::string &node_type, int count,
+ string input_node, MutableGraphView *graph) {
+ NodeDef *node_count = graph_utils::AddScalarConstNode<int64>(count, graph);
+ return graph_utils::AddNode("", node_type,
+ {std::move(input_node), node_count->name()},
+ GetCommonAttributes(), graph);
}
-void MakeCacheNode(GraphDef *graph, string input_node, NodeDef **return_node) {
- NodeDef *node_filename;
- TF_ASSERT_OK(
- graph_utils::AddScalarConstNode<StringPiece>("", graph, &node_filename));
- TF_ASSERT_OK(graph_utils::AddNode(
- "", "CacheDataset", {std::move(input_node), node_filename->name()},
- GetCommonAttributes(), graph, return_node));
+NodeDef *MakeCacheNode(string input_node, MutableGraphView *graph) {
+ NodeDef *node_filename =
+ graph_utils::AddScalarConstNode<StringPiece>("", graph);
+ return graph_utils::AddNode("", "CacheDataset",
+ {std::move(input_node), node_filename->name()},
+ GetCommonAttributes(), graph);
}
-void MakeRangeNode(GraphDef *graph, NodeDef **range_node) {
- NodeDef *start_node, *stop_node, *step_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+NodeDef *MakeRangeNode(MutableGraphView *graph) {
+ auto *start_node = graph_utils::AddScalarConstNode<int64>(0, graph);
+ auto *stop_node = graph_utils::AddScalarConstNode<int64>(10, graph);
+ auto *step_node = graph_utils::AddScalarConstNode<int64>(1, graph);
std::vector<string> range_inputs = {start_node->name(), stop_node->name(),
step_node->name()};
- TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
- GetCommonAttributes(), graph, range_node));
+ return graph_utils::AddNode("", "RangeDataset", range_inputs,
+ GetCommonAttributes(), graph);
}
struct NoOpLastEliminationTest
@@ -74,23 +70,22 @@ struct NoOpLastEliminationTest
// transformations at the end of the pipeline.
TEST_P(NoOpLastEliminationTest, EliminateLastNoOpNode) {
GrapplerItem item;
- GraphDef *graph = &item.graph;
+ MutableGraphView graph(&item.graph);
const std::string &node_type = std::get<0>(GetParam());
const int node_count = std::get<1>(GetParam());
const bool should_keep_node = std::get<2>(GetParam());
- NodeDef *range_node;
- MakeRangeNode(graph, &range_node);
+ NodeDef *range_node = MakeRangeNode(&graph);
- NodeDef *node;
- MakeUnaryNode(graph, node_type, node_count, range_node->name(), &node);
+ NodeDef *node =
+ MakeUnaryNode(node_type, node_count, range_node->name(), &graph);
NoOpElimination optimizer;
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
- EXPECT_EQ(graph_utils::ContainsNodeWithName(node->name(), output),
+ EXPECT_EQ(graph_utils::ContainsGraphNodeWithName(node->name(), output),
should_keep_node);
}
@@ -113,30 +108,29 @@ struct NoOpMiddleEliminationTest
// transformations in the middle of the pipeline.
TEST_P(NoOpMiddleEliminationTest, EliminateMiddleNoOpNode) {
GrapplerItem item;
- GraphDef *graph = &item.graph;
+ MutableGraphView graph(&item.graph);
const std::string &node_type = std::get<0>(GetParam());
const int node_count = std::get<1>(GetParam());
const bool should_keep_node = std::get<2>(GetParam());
- NodeDef *range_node;
- MakeRangeNode(graph, &range_node);
+ NodeDef *range_node = MakeRangeNode(&graph);
- NodeDef *node;
- MakeUnaryNode(graph, node_type, node_count, range_node->name(), &node);
+ NodeDef *node =
+ MakeUnaryNode(node_type, node_count, range_node->name(), &graph);
- NodeDef *cache_node;
- MakeCacheNode(graph, node->name(), &cache_node);
+ NodeDef *cache_node = MakeCacheNode(node->name(), &graph);
NoOpElimination optimizer;
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
- EXPECT_EQ(graph_utils::ContainsNodeWithName(node->name(), output),
+ EXPECT_EQ(graph_utils::ContainsGraphNodeWithName(node->name(), output),
should_keep_node);
- EXPECT_TRUE(graph_utils::ContainsNodeWithName(cache_node->name(), output));
+ EXPECT_TRUE(
+ graph_utils::ContainsGraphNodeWithName(cache_node->name(), output));
- NodeDef cache_node_out =
- output.node(graph_utils::FindNodeWithName(cache_node->name(), output));
+ NodeDef cache_node_out = output.node(
+ graph_utils::FindGraphNodeWithName(cache_node->name(), output));
EXPECT_EQ(cache_node_out.input_size(), 2);
auto last_node_input = (should_keep_node ? node : range_node)->name();
@@ -162,41 +156,40 @@ struct NoOpMultipleEliminationTest : ::testing::TestWithParam<NodesTypes> {};
// multiple noop nodes.
TEST_P(NoOpMultipleEliminationTest, EliminateMultipleNoOpNode) {
GrapplerItem item;
- GraphDef *graph = &item.graph;
+ MutableGraphView graph(&item.graph);
static_assert(std::tuple_size<NodesTypes>::value == 2,
"Make sure to include everything in the test");
const std::vector<std::pair<string, int>> noop_nodes = {
std::get<0>(GetParam()), std::get<1>(GetParam())};
- NodeDef *range_node;
- MakeRangeNode(graph, &range_node);
+ NodeDef *range_node = MakeRangeNode(&graph);
NodeDef *previous = range_node;
std::vector<string> nodes_to_remove;
nodes_to_remove.reserve(noop_nodes.size());
for (const auto &noop_node : noop_nodes) {
- NodeDef *node;
- MakeUnaryNode(graph, noop_node.first, noop_node.second, previous->name(),
- &node);
+ NodeDef *node = MakeUnaryNode(noop_node.first, noop_node.second,
+ previous->name(), &graph);
nodes_to_remove.push_back(node->name());
previous = node;
}
- NodeDef *cache_node;
- MakeCacheNode(graph, previous->name(), &cache_node);
+ NodeDef *cache_node = MakeCacheNode(previous->name(), &graph);
NoOpElimination optimizer;
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
for (const auto &noop_node_name : nodes_to_remove)
- EXPECT_FALSE(graph_utils::ContainsNodeWithName(noop_node_name, output));
+ EXPECT_FALSE(
+ graph_utils::ContainsGraphNodeWithName(noop_node_name, output));
- EXPECT_TRUE(graph_utils::ContainsNodeWithName(cache_node->name(), output));
+ EXPECT_TRUE(
+ graph_utils::ContainsGraphNodeWithName(cache_node->name(), output));
- NodeDef cache_node_out =
- output.node(graph_utils::FindNodeWithName(cache_node->name(), output));
+ NodeDef cache_node_out = output.node(
+ graph_utils::FindGraphNodeWithName(cache_node->name(), output));
EXPECT_EQ(cache_node_out.input_size(), 2);
EXPECT_EQ(cache_node_out.input(0), range_node->name());
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
index 8332fb0b1e..7c7161c5b2 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -18,8 +18,8 @@ limitations under the License.
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/grappler/clusters/cluster.h"
-#include "tensorflow/core/grappler/graph_view.h"
#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
@@ -38,55 +38,62 @@ Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
const GrapplerItem& item,
GraphDef* output) {
*output = item.graph;
- GraphView graph(output);
+ MutableGraphView graph(output);
std::set<string> nodes_to_delete;
- for (const NodeDef& node : item.graph.node()) {
- if (node.op() != "RepeatDataset") {
- continue;
- }
- // Use a more descriptive variable name now that we know the node type.
- const NodeDef repeat_node(node);
- GraphView::InputPort input_port = graph.GetInputPort(repeat_node.name(), 0);
- NodeDef* node2 = graph.GetRegularFanin(input_port).node;
- if (node2->op() != "ShuffleDataset") {
- continue;
- }
-
- NodeDef* new_node = output->add_node();
- new_node->set_op(kFusedOpName);
- graph_utils::SetUniqueName(kFusedOpName, output, new_node);
-
- // Use a more descriptive variable name now that we know the node type.
- NodeDef* shuffle_node = node2;
+ auto make_shuffle_and_repeat_node = [&output](const NodeDef& shuffle_node,
+ const NodeDef& repeat_node) {
+ NodeDef new_node;
+ new_node.set_op(kFusedOpName);
+ graph_utils::SetUniqueGraphNodeName(kFusedOpName, output, &new_node);
// Set the `input` input argument.
- new_node->add_input(shuffle_node->input(0));
+ new_node.add_input(shuffle_node.input(0));
// Set the `buffer_size` input argument.
- new_node->add_input(shuffle_node->input(1));
+ new_node.add_input(shuffle_node.input(1));
// Set the `seed` input argument.
- new_node->add_input(shuffle_node->input(2));
+ new_node.add_input(shuffle_node.input(2));
// Set the `seed2` input argument.
- new_node->add_input(shuffle_node->input(3));
+ new_node.add_input(shuffle_node.input(3));
// Set the `count` input argument.
- new_node->add_input(repeat_node.input(1));
+ new_node.add_input(repeat_node.input(1));
// Set `output_types` and `output_shapes` attributes.
for (auto key : {"output_shapes", "output_types"}) {
- (*new_node->mutable_attr())[key] = repeat_node.attr().at(key);
+ (*new_node.mutable_attr())[key] = repeat_node.attr().at(key);
}
+ return new_node;
+ };
+
+ for (const NodeDef& node : item.graph.node()) {
+ if (node.op() != "RepeatDataset") {
+ continue;
+ }
+
+ // Use a more descriptive variable name now that we know the node type.
+ const NodeDef& repeat_node = node;
+ GraphView::InputPort input_port = graph.GetInputPort(repeat_node.name(), 0);
+ NodeDef* node2 = graph.GetRegularFanin(input_port).node;
+ if (node2->op() != "ShuffleDataset") {
+ continue;
+ }
+ // Use a more descriptive variable name now that we know the node type.
+ const NodeDef& shuffle_node = *node2;
+
+ NodeDef* shuffle_and_repeat_node =
+ graph.AddNode(make_shuffle_and_repeat_node(shuffle_node, repeat_node));
+ graph.ReplaceInput(repeat_node, *shuffle_and_repeat_node);
// Mark the `Shuffle` and `Repeat` nodes for removal.
- nodes_to_delete.insert(shuffle_node->name());
+ nodes_to_delete.insert(shuffle_node.name());
nodes_to_delete.insert(repeat_node.name());
-
- graph_utils::ReplaceInput(repeat_node, *new_node, &graph);
}
- TF_RETURN_IF_ERROR(graph_utils::DeleteNodes(nodes_to_delete, output));
+
+ graph.DeleteNodes(nodes_to_delete);
return Status::OK();
}
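For reference, the fused op's wiring is simply the concatenation of the shuffle inputs with the repeat count. A standalone sketch of that input ordering (toy `Node` struct, illustrative only):

#include <string>
#include <vector>

// Toy stand-in for a NodeDef's inputs, for illustration only.
struct Node {
  std::vector<std::string> inputs;
};

// Mirrors make_shuffle_and_repeat_node: the fused dataset reads
// (input, buffer_size, seed, seed2) from ShuffleDataset and (count)
// from RepeatDataset, in that order.
std::vector<std::string> FusedInputs(const Node& shuffle, const Node& repeat) {
  return {shuffle.inputs[0], shuffle.inputs[1], shuffle.inputs[2],
          shuffle.inputs[3], repeat.inputs[1]};
}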
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
index e89675efb7..a2e470e511 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
@@ -27,7 +27,7 @@ namespace {
TEST(ShuffleAndRepeatFusionTest, FuseShuffleAndRepeatNodesIntoOne) {
GrapplerItem item;
- GraphDef *graph = &item.graph;
+ MutableGraphView graph(&item.graph);
std::vector<std::pair<string, AttrValue>> common_attrs(2);
AttrValue shapes_attr;
@@ -37,52 +37,44 @@ TEST(ShuffleAndRepeatFusionTest, FuseShuffleAndRepeatNodesIntoOne) {
SetAttrValue("output_types", &types_attr);
common_attrs[1] = std::make_pair("output_types", types_attr);
- NodeDef *start_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
- NodeDef *stop_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
- NodeDef *step_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+ NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+ NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+ NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
std::vector<string> range_inputs(3);
range_inputs[0] = start_node->name();
range_inputs[1] = stop_node->name();
range_inputs[2] = step_node->name();
- NodeDef *range_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
- common_attrs, graph, &range_node));
-
- NodeDef *buffer_size_node;
- TF_ASSERT_OK(
- graph_utils::AddScalarConstNode<int64>(128, graph, &buffer_size_node));
- NodeDef *seed_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &seed_node));
- NodeDef *seed2_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &seed2_node));
+ NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+ common_attrs, &graph);
+
+ NodeDef *buffer_size_node =
+ graph_utils::AddScalarConstNode<int64>(128, &graph);
+ NodeDef *seed_node = graph_utils::AddScalarConstNode<int64>(-1, &graph);
+ NodeDef *seed2_node = graph_utils::AddScalarConstNode<int64>(-1, &graph);
std::vector<string> shuffle_inputs(4);
shuffle_inputs[0] = range_node->name();
shuffle_inputs[1] = buffer_size_node->name();
shuffle_inputs[2] = seed_node->name();
shuffle_inputs[3] = seed2_node->name();
- NodeDef *shuffle_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "ShuffleDataset", shuffle_inputs,
- common_attrs, graph, &shuffle_node));
+ NodeDef *shuffle_node = graph_utils::AddNode(
+ "", "ShuffleDataset", shuffle_inputs, common_attrs, &graph);
- NodeDef *count_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &count_node));
+ NodeDef *count_node = graph_utils::AddScalarConstNode<int64>(-1, &graph);
std::vector<string> repeat_inputs(2);
repeat_inputs[0] = shuffle_node->name();
repeat_inputs[1] = count_node->name();
- NodeDef *repeat_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "RepeatDataset", repeat_inputs,
- common_attrs, graph, &repeat_node));
+ NodeDef *repeat_node = graph_utils::AddNode(
+ "", "RepeatDataset", repeat_inputs, common_attrs, &graph);
ShuffleAndRepeatFusion optimizer;
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
- EXPECT_FALSE(graph_utils::ContainsNodeWithName(shuffle_node->name(), output));
- EXPECT_FALSE(graph_utils::ContainsNodeWithName(repeat_node->name(), output));
+ EXPECT_FALSE(
+ graph_utils::ContainsGraphNodeWithName(shuffle_node->name(), output));
+ EXPECT_FALSE(
+ graph_utils::ContainsGraphNodeWithName(repeat_node->name(), output));
EXPECT_TRUE(
graph_utils::ContainsNodeWithOp("ShuffleAndRepeatDataset", output));
NodeDef shuffle_and_repeat_node = output.node(
@@ -103,7 +95,7 @@ TEST(ShuffleAndRepeatFusionTest, FuseShuffleAndRepeatNodesIntoOne) {
TEST(ShuffleAndRepeatFusionTest, NoChange) {
GrapplerItem item;
- GraphDef *graph = &item.graph;
+ MutableGraphView graph(&item.graph);
std::vector<std::pair<string, AttrValue>> common_attrs(2);
AttrValue shapes_attr;
@@ -113,35 +105,29 @@ TEST(ShuffleAndRepeatFusionTest, NoChange) {
SetAttrValue("output_types", &types_attr);
common_attrs[1] = std::make_pair("output_types", types_attr);
- NodeDef *start_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
- NodeDef *stop_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
- NodeDef *step_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+ NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+ NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+ NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
std::vector<string> range_inputs(3);
range_inputs[0] = start_node->name();
range_inputs[1] = stop_node->name();
range_inputs[2] = step_node->name();
- NodeDef *range_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
- common_attrs, graph, &range_node));
+ NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+ common_attrs, &graph);
- NodeDef *count_node;
- TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &count_node));
+ NodeDef *count_node = graph_utils::AddScalarConstNode<int64>(-1, &graph);
std::vector<string> repeat_inputs(2);
repeat_inputs[0] = range_node->name();
repeat_inputs[1] = count_node->name();
- NodeDef *repeat_node;
- TF_ASSERT_OK(graph_utils::AddNode("", "RepeatDataset", repeat_inputs,
- common_attrs, graph, &repeat_node));
+ graph_utils::AddNode("", "RepeatDataset", repeat_inputs, common_attrs,
+ &graph);
ShuffleAndRepeatFusion optimizer;
GraphDef output;
TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
- EXPECT_TRUE(graph_utils::Compare(*graph, output));
+ EXPECT_TRUE(graph_utils::Compare(*graph.GetGraph(), output));
}
} // namespace
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index fdd82b9603..bb14ce310d 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -24,6 +24,7 @@ limitations under the License.
#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/grappler/utils/topological_sort.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/stringpiece.h"
@@ -53,16 +54,6 @@ bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
return removed_input;
}
-void DeleteNodes(const std::set<int>& nodes_to_delete, GraphDef* graph) {
- int last = graph->node_size() - 1;
- for (auto it = nodes_to_delete.rbegin(); it != nodes_to_delete.rend(); ++it) {
- const int index = *it;
- graph->mutable_node()->SwapElements(index, last);
- last--;
- }
- graph->mutable_node()->DeleteSubrange(last + 1, nodes_to_delete.size());
-}
-
} // namespace
bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) const {
@@ -441,7 +432,7 @@ Status DependencyOptimizer::OptimizeDependencies() {
if (fetch_nodes_known_) {
VLOG(1) << "Deleted " << nodes_to_delete.size() << " out of "
<< optimized_graph_->node_size() << " nodes.";
- DeleteNodes(nodes_to_delete, optimized_graph_);
+ EraseNodesFromGraph(nodes_to_delete, optimized_graph_);
node_map_.reset(new NodeMap(optimized_graph_));
BuildNodeToIdx();
}
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 9627ed7323..405778222a 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -616,15 +616,13 @@ Status RemoveDeadBranches(const std::unordered_set<string>& nodes_to_preserve,
}
}
- int last = optimized_graph->node_size() - 1;
- for (int i = optimized_graph->node_size() - 1; i >= 0; --i) {
- NodeDef* node = optimized_graph->mutable_node(i);
- if (dead_nodes.find(node) != dead_nodes.end()) {
- optimized_graph->mutable_node()->SwapElements(i, last);
- last--;
- }
+ std::vector<int> nodes_idx_to_delete;
+ nodes_idx_to_delete.reserve(dead_nodes.size());
+ for (int i = 0; i < optimized_graph->node_size(); ++i) {
+ if (dead_nodes.count(&optimized_graph->node(i)))
+ nodes_idx_to_delete.push_back(i);
}
- optimized_graph->mutable_node()->DeleteSubrange(last + 1, dead_nodes.size());
+ EraseNodesFromGraph(std::move(nodes_idx_to_delete), optimized_graph);
for (const auto& itr : dead_merge_inputs) {
NodeDef* dead_node = itr.first;
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index c8e63f95e1..153785d3b4 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
+#include "tensorflow/core/grappler/utils.h"
+
#include <memory>
+#include <queue>
#include <vector>
#include "tensorflow/core/framework/attr_value.pb.h"
@@ -21,7 +24,6 @@ limitations under the License.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/scanner.h"
#include "tensorflow/core/lib/strings/strcat.h"
@@ -354,13 +356,51 @@ void DedupControlInputs(NodeDef* node) {
}
namespace {
+
+template <typename UniqueContainer>
+void EraseNodesFromGraphImpl(const UniqueContainer& nodes_to_delete,
+ GraphDef* graph) {
+ static_assert(std::is_same<typename UniqueContainer::value_type, int>::value,
+ "Need to pass container of ints");
+
+ int last = graph->node_size() - 1;
+ for (auto it = nodes_to_delete.rbegin(); it != nodes_to_delete.rend(); ++it) {
+ const int index = *it;
+ graph->mutable_node()->SwapElements(index, last);
+ last--;
+ }
+ graph->mutable_node()->DeleteSubrange(last + 1, nodes_to_delete.size());
+}
+
template <typename T>
inline void STLSortAndRemoveDuplicates(T* v) {
std::sort(v->begin(), v->end());
v->erase(std::unique(v->begin(), v->end()), v->end());
}
+
} // namespace
+void EraseNodesFromGraph(const std::set<int>& nodes_to_delete,
+ GraphDef* graph) {
+ EraseNodesFromGraphImpl(nodes_to_delete, graph);
+}
+
+void EraseNodesFromGraph(std::vector<int>&& nodes_to_delete, GraphDef* graph) {
+ STLSortAndRemoveDuplicates(&nodes_to_delete);
+ EraseNodesFromGraphImpl(nodes_to_delete, graph);
+}
+
+void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
+ GraphDef* graph) {
+ std::vector<int> nodes_idx_to_delete;
+ nodes_idx_to_delete.reserve(nodes_to_delete.size());
+ for (int i = 0; i < graph->node_size(); ++i) {
+ if (nodes_to_delete.count(graph->node(i).name()))
+ nodes_idx_to_delete.push_back(i);
+ }
+ EraseNodesFromGraphImpl(nodes_idx_to_delete, graph);
+}
+
Status SimpleGraphView::Initialize(
const GraphDef& graph,
const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
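The deletion strategy deserves a note: indices are visited in descending order, each doomed element is swapped to the current tail, and a single DeleteSubrange trims the tail, so k deletions cost O(k) swaps rather than k separate shifts. An equivalent standalone version on std::vector (illustration only; the real code operates on the repeated node field of GraphDef):

#include <set>
#include <utility>
#include <vector>

// Mirror of EraseNodesFromGraphImpl on a plain vector: swap each element
// slated for deletion to the back (largest index first, so earlier indices
// stay valid), then chop the tail off in one operation.
template <typename T>
void EraseIndices(const std::set<int>& to_delete, std::vector<T>* v) {
  int last = static_cast<int>(v->size()) - 1;
  for (auto it = to_delete.rbegin(); it != to_delete.rend(); ++it) {
    std::swap((*v)[*it], (*v)[last]);
    --last;
  }
  v->resize(v->size() - to_delete.size());
}

int main() {
  std::vector<int> v = {10, 11, 12, 13, 14};
  EraseIndices({1, 3}, &v);  // removes 11 and 13; survivors may be reordered
  return v.size() == 3 ? 0 : 1;
}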
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 1c6fef59ea..b297caa8d4 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -209,6 +209,13 @@ void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
Status SetTensorValue(DataType dtype, int value, Tensor* tensor);
+void EraseNodesFromGraph(const std::set<int>& nodes_to_delete, GraphDef* graph);
+
+void EraseNodesFromGraph(std::vector<int>&& nodes_to_delete, GraphDef* graph);
+
+void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
+ GraphDef* graph);
+
class SimpleGraphView {
public:
// Build a graph view for the specified graphdef.
@@ -237,6 +244,9 @@ class SimpleGraphView {
DCHECK(it != name_to_index_.end());
return it == name_to_index_.end() ? -1 : it->second;
}
+ inline const NodeDef& node(int node_idx) const {
+ return graph_->node(node_idx);
+ }
inline const string& node_name(int node_idx) const {
return index_to_name_[node_idx];
}
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 49a1996d25..c6e035834c 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -16,7 +16,9 @@ limitations under the License.
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/notification.h"
@@ -333,7 +335,9 @@ TEST_F(UtilsTest, NumNonControlOutputs) {
EXPECT_EQ(1, NumNonControlDataOutputs(*add_node, node_map));
}
-TEST_F(UtilsTest, DeleteNodes) {}
+TEST_F(UtilsTest, DeleteNodes) {
+ // TODO(rmlarsen): write the forgotten test.
+}
} // namespace
} // namespace grappler
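A minimal sketch of what the missing test could look like, exercising the by-name EraseNodesFromGraph overload added above (hypothetical test body, not part of the patch; assumes the usual includes of this test file):

// Hypothetical sketch only. Deleting "b" swaps the last node into its slot
// and trims the tail, so the survivors are {"a", "c"} in that order.
TEST_F(UtilsTest, EraseNodesFromGraphByName) {
  GraphDef graph;
  for (const char* name : {"a", "b", "c"}) {
    graph.add_node()->set_name(name);
  }
  EraseNodesFromGraph(std::set<string>{"b"}, &graph);
  ASSERT_EQ(2, graph.node_size());
  EXPECT_EQ("a", graph.node(0).name());
  EXPECT_EQ("c", graph.node(1).name());
}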
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 10cbcdecc8..65a7f8ccf3 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -124,6 +124,7 @@ tf_kernel_library(
":bounds_check",
":dense_update_functor",
":ops_util",
+ ":training_op_helpers",
":variable_ops",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
@@ -557,12 +558,6 @@ cc_header_only_library(
deps = [":image_resizer_state"],
)
-cc_library(
- name = "crop_resize_bilinear_core",
- hdrs = ["crop_resize_bilinear_core.h"],
- visibility = ["//visibility:private"],
-)
-
# OpKernel libraries ----------------------------------------------------------
ARRAY_DEPS = [
@@ -2158,7 +2153,7 @@ tf_kernel_library(
tf_kernel_library(
name = "crop_and_resize_op",
prefix = "crop_and_resize_op",
- deps = IMAGE_DEPS + [":crop_resize_bilinear_core"],
+ deps = IMAGE_DEPS,
)
tf_kernel_library(
@@ -2224,7 +2219,7 @@ tf_kernel_library(
tf_kernel_library(
name = "resize_bilinear_op",
prefix = "resize_bilinear_op",
- deps = IMAGE_DEPS + [":crop_resize_bilinear_core"],
+ deps = IMAGE_DEPS,
)
tf_kernel_library(
@@ -4864,7 +4859,6 @@ filegroup(
"concat_op.cc",
"constant_op.cc",
"constant_op.h",
- "crop_resize_bilinear_core.h",
"cwise_ops.h",
"cwise_ops_common.cc",
"cwise_ops_common.h",
@@ -5232,6 +5226,16 @@ filegroup(
visibility = ["//visibility:public"],
)
+ANDROID_TEXTUAL_HDRS = [
+ "gather_nd_op_cpu_impl.h",
+ "gemm_functors.h",
+ "mirror_pad_op_cpu_impl.h",
+ "scatter_nd_op_cpu_impl.h",
+ "slice_op_cpu_impl.h",
+ "strided_slice_op_impl.h",
+ "tile_ops_cpu_impl.h",
+]
+
# A file group which contains nearly all available operators which
# may work on Android. This is intended to be used with selective
# registration.
@@ -5293,10 +5297,20 @@ filegroup(
"batch_kernels.*",
"regex_full_match_op.cc",
"regex_replace_op.cc",
- ],
+ # Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
+ "mkl_*",
+ "xsmm_*",
+ "cwise_ops_sycl_common.h",
+ ] + ANDROID_TEXTUAL_HDRS,
),
visibility = ["//visibility:public"],
)
+
+filegroup(
+ name = "android_all_ops_textual_hdrs",
+ srcs = ANDROID_TEXTUAL_HDRS,
+ visibility = ["//visibility:public"],
+)
# LINT.ThenChange(//tensorflow/contrib/makefile/tf_op_files.txt)
cc_library(
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
index adc573e40c..c731b64993 100644
--- a/tensorflow/core/kernels/argmax_op.cc
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -76,6 +76,10 @@ class ArgOp : public OpKernel {
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+ if (output_shape.num_elements() == 0) {
+ return;
+ }
+
#define HANDLE_DIM(NDIM) \
case NDIM: \
ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(), \
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index b77c14d012..656b6ced6d 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -147,13 +147,21 @@ class AdaptiveSharedBatchScheduler
// Tracks processing latency and adjusts in_flight_batches_limit to minimize.
void CallbackWrapper(const internal::ASBSBatch<TaskType>* batch,
- BatchProcessor callback);
+ BatchProcessor callback, bool is_express);
// Schedules batch if in_flight_batches_limit_ is not met.
void MaybeScheduleNextBatch() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+ // Schedules the earliest closed batch in batches_
+ // if batch_thread_pool_ has an idle thread.
+ // Batches scheduled this way are called express batches.
+ // Express batches are not limited by in_flight_batches_limit_, and
+ // their latencies will not affect in_flight_batches_limit_.
+ void MaybeScheduleClosedBatch() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
// Notifies scheduler of non-empty batch which is eligible for processing.
- void AddBatch(const internal::ASBSBatch<TaskType>* batch);
+ void AddBatch(const internal::ASBSBatch<TaskType>* batch,
+ bool also_schedule_closed_batch);
// Removes queue from scheduler.
void RemoveQueue(const internal::ASBSQueue<TaskType>* queue);
@@ -180,8 +188,10 @@ class AdaptiveSharedBatchScheduler
// results in an actual cap of 3 80% of the time, and 4 20% of the time.
double in_flight_batches_limit_ GUARDED_BY(mu_);
- // Number of batches currently being processed.
+ // Number of regular batches currently being processed.
int64 in_flight_batches_ GUARDED_BY(mu_) = 0;
+ // Number of express batches currently being processed.
+ int64 in_flight_express_batches_ GUARDED_BY(mu_) = 0;
// RNG engine and distribution.
std::default_random_engine rand_engine_;
@@ -363,10 +373,14 @@ Status AdaptiveSharedBatchScheduler<TaskType>::AddQueue(
template <typename TaskType>
void AdaptiveSharedBatchScheduler<TaskType>::AddBatch(
- const internal::ASBSBatch<TaskType>* batch) {
+ const internal::ASBSBatch<TaskType>* batch,
+ bool also_schedule_closed_batch) {
mutex_lock l(mu_);
batches_.push_back(batch);
MaybeScheduleNextBatch();
+ if (also_schedule_closed_batch) {
+ MaybeScheduleClosedBatch();
+ }
}
template <typename TaskType>
@@ -407,19 +421,45 @@ void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleNextBatch() {
batch->queue()->ReleaseBatch(batch);
batch_thread_pool_->Schedule(
std::bind(&AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper, this,
- batch, queues_and_callbacks_[batch->queue()]));
+ batch, queues_and_callbacks_[batch->queue()], false));
in_flight_batches_++;
}
template <typename TaskType>
+void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleClosedBatch() {
+ if (in_flight_batches_ + in_flight_express_batches_ >=
+ options_.num_batch_threads) {
+ return;
+ }
+ for (auto it = batches_.begin(); it != batches_.end(); it++) {
+ if ((*it)->IsClosed()) {
+ const internal::ASBSBatch<TaskType>* batch = *it;
+ batches_.erase(it);
+ batch->queue()->ReleaseBatch(batch);
+ batch_thread_pool_->Schedule(
+ std::bind(&AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper,
+ this, batch, queues_and_callbacks_[batch->queue()], true));
+ in_flight_express_batches_++;
+ return;
+ }
+ }
+}
+
+template <typename TaskType>
void AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper(
const internal::ASBSBatch<TaskType>* batch,
- AdaptiveSharedBatchScheduler<TaskType>::BatchProcessor callback) {
+ AdaptiveSharedBatchScheduler<TaskType>::BatchProcessor callback,
+ bool is_express) {
int64 start_time = batch->creation_time_micros();
callback(std::unique_ptr<Batch<TaskType>>(
const_cast<internal::ASBSBatch<TaskType>*>(batch)));
int64 end_time = GetEnv()->NowMicros();
mutex_lock l(mu_);
+ if (is_express) {
+ in_flight_express_batches_--;
+ MaybeScheduleClosedBatch();
+ return;
+ }
in_flight_batches_--;
batch_count_++;
batch_latency_sum_ += end_time - start_time;
@@ -496,6 +536,7 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
" is larger than maximum batch size ",
options_.max_batch_size);
}
+ bool is_old_batch_closed = false;
{
mutex_lock l(mu_);
// Current batch is full, create another if allowed.
@@ -505,6 +546,7 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
return errors::Unavailable("The batch scheduling queue is full");
}
current_batch_->Close();
+ is_old_batch_closed = true;
current_batch_ = nullptr;
}
if (!current_batch_) {
@@ -516,7 +558,8 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
num_enqueued_tasks_++;
}
// AddBatch must be called outside of lock, since it may call ReleaseBatch.
- if (new_batch != nullptr) scheduler_->AddBatch(new_batch);
+ if (new_batch != nullptr)
+ scheduler_->AddBatch(new_batch, is_old_batch_closed);
return Status::OK();
}
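The express path introduces a second admission budget: regular batches are capped by in_flight_batches_limit_, while express batches may consume any otherwise-idle thread, up to num_batch_threads in total. A toy sketch of that accounting (ignoring the fractional-limit randomization noted above; all names here are illustrative):

// Toy admission check mirroring MaybeScheduleNextBatch /
// MaybeScheduleClosedBatch; not the scheduler itself.
struct Budget {
  int num_batch_threads;
  double in_flight_batches_limit;
  int in_flight_batches = 0;          // regular batches in flight
  int in_flight_express_batches = 0;  // express batches in flight

  bool CanScheduleRegular() const {
    return in_flight_batches < in_flight_batches_limit;
  }
  bool CanScheduleExpress() const {
    // Express batches only consume otherwise-idle threads.
    return in_flight_batches + in_flight_express_batches < num_batch_threads;
  }
};

int main() {
  Budget b;
  b.num_batch_threads = 4;
  b.in_flight_batches_limit = 2;
  b.in_flight_batches = 2;                // regular budget exhausted...
  return b.CanScheduleExpress() ? 0 : 1;  // ...but idle threads allow express
}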
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index e6e388b3d1..0478c93280 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -55,8 +55,41 @@ typedef Eigen::SyclDevice SYCLDevice;
FN(arg0, std::complex<double>)
CastOpBase::CastOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {
- OP_REQUIRES_OK(ctx, ctx->GetAttr("SrcT", &src_dtype_));
- OP_REQUIRES_OK(ctx, ctx->GetAttr("DstT", &dst_dtype_));
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("SrcT", &external_src_dtype_));
+
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("DstT", &external_dst_dtype_));
+
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("Truncate", &use_truncation_));
+
+ // Quantized data types use the same underlying format as their non-quantized
+ // versions, so we use the non-quantized implementation for casting.
+ if (external_dst_dtype_ == DT_QUINT8) {
+ dst_dtype_ = DT_UINT8;
+ } else if (external_dst_dtype_ == DT_QINT8) {
+ dst_dtype_ = DT_INT8;
+ } else if (external_dst_dtype_ == DT_QINT32) {
+ dst_dtype_ = DT_INT32;
+ } else if (external_dst_dtype_ == DT_QINT16) {
+ dst_dtype_ = DT_INT16;
+ } else if (external_dst_dtype_ == DT_QUINT16) {
+ dst_dtype_ = DT_UINT16;
+ } else {
+ dst_dtype_ = external_dst_dtype_;
+ }
+
+ if (external_src_dtype_ == DT_QUINT8) {
+ src_dtype_ = DT_UINT8;
+ } else if (external_src_dtype_ == DT_QINT8) {
+ src_dtype_ = DT_INT8;
+ } else if (external_src_dtype_ == DT_QINT32) {
+ src_dtype_ = DT_INT32;
+ } else if (external_src_dtype_ == DT_QINT16) {
+ src_dtype_ = DT_INT16;
+ } else if (external_src_dtype_ == DT_QUINT16) {
+ src_dtype_ = DT_UINT16;
+ } else {
+ src_dtype_ = external_src_dtype_;
+ }
}
void CastOpBase::Compute(OpKernelContext* ctx) {
@@ -64,15 +97,20 @@ void CastOpBase::Compute(OpKernelContext* ctx) {
if (work_ == nullptr) {
ctx->set_output(0, inp);
} else {
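+ // Reinterpret the (possibly quantized) input as its underlying type;
+ // UnsafeCopyFromInternal shares the buffer rather than copying the data.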
+ Tensor in;
+ in.UnsafeCopyFromInternal(inp, src_dtype_, inp.shape());
Tensor* out = nullptr;
- OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out));
- work_(ctx, inp, out);
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in.shape(), &out));
+ out->set_dtype(dst_dtype_);
+ work_(ctx, in, out, use_truncation_);
+ out->set_dtype(external_dst_dtype_);
}
}
Status CastOpBase::Unimplemented() {
- return errors::Unimplemented("Cast ", DataTypeString(src_dtype_), " to ",
- DataTypeString(dst_dtype_), " is not supported");
+ return errors::Unimplemented("Cast ", DataTypeString(external_src_dtype_),
+ " to ", DataTypeString(external_dst_dtype_),
+ " is not supported");
}
CpuCastOp::CpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
@@ -80,7 +118,7 @@ CpuCastOp::CpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
}
Status CpuCastOp::Prepare() {
- if (src_dtype_ == dst_dtype_) {
+ if (external_src_dtype_ == external_dst_dtype_) {
work_ = nullptr; // Identity
return Status::OK();
}
@@ -133,7 +171,7 @@ class GpuCastOp : public CastOpBase {
private:
Status Prepare() {
- if (src_dtype_ == dst_dtype_) {
+ if (external_src_dtype_ == external_dst_dtype_) {
work_ = nullptr; // Identity
return Status::OK();
}
@@ -215,7 +253,7 @@ class SyclCastOp : public CastOpBase {
private:
Status Prepare() {
- if (src_dtype_ == dst_dtype_) {
+ if (external_src_dtype_ == external_dst_dtype_) {
work_ = nullptr; // Identity
return Status::OK();
}
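The two if/else chains in the constructor above encode a single mapping from quantized types to their storage types; a table-driven helper would be one possible simplification (a sketch, not part of the patch):

// Sketch only: map a quantized DataType to the type of its underlying
// storage, leaving all other types untouched.
static DataType UnderlyingType(DataType dtype) {
  switch (dtype) {
    case DT_QUINT8:  return DT_UINT8;
    case DT_QINT8:   return DT_INT8;
    case DT_QINT32:  return DT_INT32;
    case DT_QINT16:  return DT_INT16;
    case DT_QUINT16: return DT_UINT16;
    default:         return dtype;
  }
}
// ...which would allow: src_dtype_ = UnderlyingType(external_src_dtype_);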
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 16d2e0e0a5..527ab528c9 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -24,8 +24,71 @@ limitations under the License.
#include "tensorflow/core/platform/byte_order.h"
#include "tensorflow/core/platform/types.h"
+// Note that the GPU cast functor templates need to be explicitly
+// instantiated, unlike the CPU ones; hence their specializations differ from
+// those for CPUs.
+#ifdef SPECIALIZE_FOR_GPUS
+#define SPECIALIZE_CAST(DEVICE, OUT_TYPE, IN_OUT) \
+ template <typename Device> \
+ struct CastFunctor<Device, OUT_TYPE, IN_OUT> { \
+ void operator()(const Device& d, \
+ typename TTypes<OUT_TYPE>::Flat out_tensor, \
+ typename TTypes<IN_OUT>::ConstFlat in_tensor, \
+ bool truncate = false) { \
+ if (truncate) { \
+ out_tensor.device(d) = \
+ in_tensor.unaryExpr(LSBZeroSetter<IN_OUT, OUT_TYPE>()) \
+ .template cast<OUT_TYPE>(); \
+ } else { \
+ out_tensor.device(d) = in_tensor.template cast<OUT_TYPE>(); \
+ } \
+ } \
+ }; \
+ template struct CastFunctor<DEVICE, OUT_TYPE, IN_OUT>;
+#else
+#define SPECIALIZE_CAST(DEVICE, OUT_TYPE, IN_OUT) \
+ template <> \
+ struct CastFunctor<DEVICE, OUT_TYPE, IN_OUT> { \
+ void operator()(const DEVICE& d, \
+ typename TTypes<OUT_TYPE>::Flat out_tensor, \
+ typename TTypes<IN_OUT>::ConstFlat in_tensor, \
+ bool truncate = false) { \
+ if (truncate) { \
+ out_tensor.device(d) = \
+ in_tensor.unaryExpr(LSBZeroSetter<IN_OUT, OUT_TYPE>()) \
+ .template cast<OUT_TYPE>(); \
+ } else { \
+ out_tensor.device(d) = in_tensor.template cast<OUT_TYPE>(); \
+ } \
+ } \
+ };
+#endif
+
+#define CAST_FUNCTORS(devname) \
+ SPECIALIZE_CAST(devname, float, double) \
+ SPECIALIZE_CAST(devname, float, std::complex<double>) \
+ SPECIALIZE_CAST(devname, std::complex<float>, std::complex<double>) \
+ SPECIALIZE_CAST(devname, std::complex<float>, double) \
+ SPECIALIZE_CAST(devname, Eigen::half, double) \
+ SPECIALIZE_CAST(devname, Eigen::half, float) \
+ SPECIALIZE_CAST(devname, Eigen::half, std::complex<double>) \
+ SPECIALIZE_CAST(devname, Eigen::half, std::complex<float>) \
+ SPECIALIZE_CAST(devname, bfloat16, float) \
+ template <typename OUT_TYPE, typename IN_OUT> \
+ struct CastFunctor<devname, OUT_TYPE, IN_OUT> { \
+ void operator()(const devname& d, \
+ typename TTypes<OUT_TYPE>::Flat out_tensor, \
+ typename TTypes<IN_OUT>::ConstFlat in_tensor, \
+ bool truncate = false) { \
+ out_tensor.device(d) = in_tensor.template cast<OUT_TYPE>(); \
+ } \
+ };
+
namespace tensorflow {
+typedef std::function<void(OpKernelContext*, const Tensor&, Tensor*,
+ bool trunc)>
+ CastFunctorType;
+
// Common base class of Cast kernels
class CastOpBase : public OpKernel {
public:
@@ -36,8 +99,10 @@ class CastOpBase : public OpKernel {
protected:
DataType src_dtype_;
DataType dst_dtype_;
- std::function<void(OpKernelContext*, const Tensor&, Tensor*)> work_ = nullptr;
-
+ DataType external_src_dtype_;
+ DataType external_dst_dtype_;
+ bool use_truncation_;
+ CastFunctorType work_ = nullptr;
Status Unimplemented();
TF_DISALLOW_COPY_AND_ASSIGN(CastOpBase);
@@ -54,6 +119,23 @@ class CpuCastOp : public CastOpBase {
namespace functor {
+template <typename I>
+constexpr int MantissaWidth() {
+ return std::numeric_limits<I>::digits;
+}
+
+template <>
+constexpr int MantissaWidth<Eigen::half>() {
+ // Remember, there's 1 hidden bit
+ return 10 + 1;
+}
+
+template <>
+constexpr int MantissaWidth<bfloat16>() {
+ // Remember, there's 1 hidden bit
+ return 7 + 1;
+}
+
template <typename Device, typename Tout, typename Tin>
void Cast(const Device& d, typename TTypes<Tout>::Flat o,
typename TTypes<Tin>::ConstFlat i) {
@@ -63,7 +145,85 @@ void Cast(const Device& d, typename TTypes<Tout>::Flat o,
template <typename Device, typename Tout, typename Tin>
struct CastFunctor {
void operator()(const Device& d, typename TTypes<Tout>::Flat o,
- typename TTypes<Tin>::ConstFlat i);
+ typename TTypes<Tin>::ConstFlat i, bool truncate = false);
+};
+
+// Only enable LSBZeroSetterHelper for 64- and 32-bit input data types.
+// Specialize for other types if needed in the future.
+template <typename I>
+typename std::enable_if<sizeof(I) == 8, void>::type EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) {
+ // Only zero the bits for non-NaNs.
+ // For NaNs, let the non-truncation version handle it.
+ if (!std::isnan(t)) {
+ uint64_t* p = reinterpret_cast<uint64_t*>(&t);
+ *p &= (0xFFFFFFFFFFFFFFFF << n);
+ }
+}
+
+template <typename I>
+typename std::enable_if<sizeof(I) == 4, void>::type EIGEN_DEVICE_FUNC
+ EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) {
+ // Only zero the bits for non-NaNs.
+ // For NaNs, let the non-truncation version handle it.
+ if (!std::isnan(t)) {
+ uint32_t* p = reinterpret_cast<uint32_t*>(&t);
+ *p &= (0xFFFFFFFF << n);
+ }
+}
+
+// Set n least significant bits to 0
+template <typename I, typename O>
+struct LSBZeroSetter {
+ EIGEN_EMPTY_STRUCT_CTOR(LSBZeroSetter)
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const I operator()(const I& a) const {
+ constexpr int bits = MantissaWidth<I>() - MantissaWidth<O>();
+ static_assert(
+ bits > 0,
+ "The output type must have fewer mantissa bits than the input type\n");
+ I t = a;
+ LSBZeroSetterHelper(t, bits);
+ return t;
+ }
+};
+
+template <typename I, typename O>
+struct LSBZeroSetter<std::complex<I>, std::complex<O>> {
+ EIGEN_EMPTY_STRUCT_CTOR(LSBZeroSetter)
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<I> operator()(
+ const std::complex<I>& a) const {
+ constexpr int bits = MantissaWidth<I>() - MantissaWidth<O>();
+ static_assert(
+ bits > 0,
+ "The output type must have fewer mantissa bits than the input type\n");
+ I re = std::real(a);
+ I img = std::imag(a);
+ LSBZeroSetterHelper(re, bits);
+ LSBZeroSetterHelper(img, bits);
+ std::complex<I> toReturn(re, img);
+ return toReturn;
+ }
+};
+
+template <typename I, typename O>
+struct LSBZeroSetter<std::complex<I>, O> {
+ EIGEN_EMPTY_STRUCT_CTOR(LSBZeroSetter)
+ // Zeroes the excess mantissa bits of the real and imaginary parts.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<I> operator()(
+ const std::complex<I>& a) const {
+ constexpr int bits = MantissaWidth<I>() - MantissaWidth<O>();
+ static_assert(
+ bits > 0,
+ "The output type must have fewer mantissa bits than the input type\n");
+ I re = std::real(a);
+ I img = std::imag(a);
+ LSBZeroSetterHelper(re, bits);
+ LSBZeroSetterHelper(img, bits);
+ std::complex<I> toReturn(re, img);
+ return toReturn;
+ }
};
} // end namespace functor
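To make the truncation concrete: MantissaWidth<float>() is 24 and MantissaWidth<bfloat16>() is 8, so LSBZeroSetter<float, bfloat16> clears 16 mantissa bits. A standalone demonstration of that bit manipulation (plain C++, not TF code):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Standalone illustration: zero the 16 least significant mantissa bits of a
// float, as the Truncate path does for a float -> bfloat16 cast. The real
// code also skips NaNs; this demo uses a finite value.
int main() {
  float f = 1.0f + 1.0f / (1 << 20);  // low mantissa bits are set
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  u &= 0xFFFFFFFFu << 16;  // clear the 16 LSBs of the mantissa
  float t;
  std::memcpy(&t, &u, sizeof(t));
  std::printf("%.9g -> %.9g\n", f, t);  // prints 1.00000095 -> 1
  return 0;
}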
diff --git a/tensorflow/core/kernels/cast_op_gpu.cu.cc b/tensorflow/core/kernels/cast_op_gpu.cu.cc
index 607e7f5efd..036996fca2 100644
--- a/tensorflow/core/kernels/cast_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/cast_op_gpu.cu.cc
@@ -18,22 +18,19 @@ limitations under the License.
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/bfloat16.h"
+#define SPECIALIZE_FOR_GPUS
#include "tensorflow/core/kernels/cast_op.h"
+#undef SPECIALIZE_FOR_GPUS
namespace tensorflow {
namespace functor {
typedef Eigen::GpuDevice GPUDevice;
-template <typename O, typename I>
-struct CastFunctor<GPUDevice, O, I> {
- void operator()(const GPUDevice& d, typename TTypes<O>::Flat o,
- typename TTypes<I>::ConstFlat i) {
- Cast<GPUDevice, O, I>(d, o, i);
- }
-};
+CAST_FUNCTORS(GPUDevice);
#define DEFINE(O, I) template struct CastFunctor<GPUDevice, O, I>
+
#define DEFINE_ALL_FROM(in_type) \
DEFINE(in_type, bool); \
DEFINE(in_type, uint8); \
@@ -59,14 +56,43 @@ DEFINE_ALL_FROM(int8);
DEFINE_ALL_FROM(int16);
DEFINE_ALL_FROM(int32);
DEFINE_ALL_FROM(int64);
-DEFINE_ALL_FROM(Eigen::half);
-DEFINE_ALL_FROM(float);
DEFINE_ALL_FROM(double);
-DEFINE_ALL_FROM(std::complex<float>);
DEFINE_ALL_FROM(std::complex<double>);
-DEFINE(bfloat16, float);
DEFINE(float, bfloat16);
+#define DEFINE_ALL_TO_FLOAT(out_type) \
+ DEFINE(out_type, bool); \
+ DEFINE(out_type, uint8); \
+ DEFINE(out_type, uint16); \
+ DEFINE(out_type, uint32); \
+ DEFINE(out_type, uint64); \
+ DEFINE(out_type, int8); \
+ DEFINE(out_type, int16); \
+ DEFINE(out_type, int32); \
+ DEFINE(out_type, int64); \
+ DEFINE(out_type, Eigen::half); \
+ DEFINE(out_type, float); \
+ DEFINE(out_type, std::complex<float>)
+
+#define DEFINE_ALL_TO_HALF(out_type) \
+ DEFINE(out_type, bool); \
+ DEFINE(out_type, uint8); \
+ DEFINE(out_type, uint16); \
+ DEFINE(out_type, uint32); \
+ DEFINE(out_type, uint64); \
+ DEFINE(out_type, int8); \
+ DEFINE(out_type, int16); \
+ DEFINE(out_type, int32); \
+ DEFINE(out_type, int64); \
+ DEFINE(out_type, Eigen::half)
+
+DEFINE_ALL_TO_HALF(Eigen::half);
+DEFINE_ALL_TO_HALF(bfloat16);
+DEFINE_ALL_TO_FLOAT(float);
+DEFINE_ALL_TO_FLOAT(std::complex<float>);
+
+#undef DEFINE_ALL_TO_FLOAT
+#undef DEFINE_ALL_TO_HALF
#undef DEFINE_ALL_FROM
#undef DEFINE
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index fe821b25df..b899bac681 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -25,22 +25,10 @@ namespace tensorflow {
namespace functor {
-template <typename O, typename I>
-struct CastFunctor<Eigen::ThreadPoolDevice, O, I> {
- void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<O>::Flat o,
- typename TTypes<I>::ConstFlat i) {
- o.device(d) = i.template cast<O>();
- }
-};
+CAST_FUNCTORS(Eigen::ThreadPoolDevice);
#ifdef TENSORFLOW_USE_SYCL
-template <typename O, typename I>
-struct CastFunctor<Eigen::SyclDevice, O, I> {
- void operator()(const Eigen::SyclDevice& d, typename TTypes<O>::Flat o,
- typename TTypes<I>::ConstFlat i) {
- o.device(d) = i.template cast<O>();
- }
-};
+CAST_FUNCTORS(Eigen::SyclDevice);
#endif // TENSORFLOW_USE_SYCL
} // namespace functor
@@ -68,139 +56,103 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
CURRY_TYPES3_NO_BF16(FN, arg0, arg1) \
FN(arg0, arg1, bfloat16);
-#define CAST_CASE(DEVICE, IN, OUT) \
- if (DataTypeToEnum<OUT>::value == dst_dtype) { \
- return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) { \
- functor::CastFunctor<DEVICE, OUT, IN> func; \
- func(ctx->eigen_device<DEVICE>(), out->flat<OUT>(), inp.flat<IN>()); \
- }; \
+#define CAST_CASE(DEVICE, IN, OUT) \
+ if (DataTypeToEnum<OUT>::value == dst_dtype) { \
+ return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out, \
+ bool truncate) { \
+ functor::CastFunctor<DEVICE, OUT, IN> func; \
+ func(ctx->eigen_device<DEVICE>(), out->flat<OUT>(), inp.flat<IN>(), \
+ truncate); \
+ }; \
}
// The functions below are implemented in the cast_op_impl_*.cc files.
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromBool(DataType dst_dtype);
+CastFunctorType GetCpuCastFromBool(DataType dst_dtype);
+
+CastFunctorType GetCpuCastFromUint8(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint8(DataType dst_dtype);
+CastFunctorType GetCpuCastFromUint16(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint16(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt8(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint32(DataType dst_dtype);
+CastFunctorType GetCpuCastFromUint32(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint64(DataType dst_dtype);
+CastFunctorType GetCpuCastFromUint64(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt8(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt8(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt16(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt16(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt32(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt32(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt64(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt64(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromHalf(DataType dst_dtype);
+CastFunctorType GetCpuCastFromHalf(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromFloat(DataType dst_dtype);
+CastFunctorType GetCpuCastFromFloat(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromDouble(DataType dst_dtype);
+CastFunctorType GetCpuCastFromDouble(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromComplex64(DataType dst_dtype);
+CastFunctorType GetCpuCastFromComplex64(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromComplex128(DataType dst_dtype);
+CastFunctorType GetCpuCastFromComplex128(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromBfloat(DataType dst_dtype);
+CastFunctorType GetCpuCastFromBfloat(DataType dst_dtype);
#if GOOGLE_CUDA
// Same, for GPU.
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromBool(DataType dst_dtype);
+CastFunctorType GetGpuCastFromBool(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint8(DataType dst_dtype);
+CastFunctorType GetGpuCastFromUint8(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint16(DataType dst_dtype);
+CastFunctorType GetGpuCastFromUint16(DataType dst_dtype);
+CastFunctorType GetGpuCastFromInt8(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint32(DataType dst_dtype);
+CastFunctorType GetGpuCastFromUint32(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint64(DataType dst_dtype);
+CastFunctorType GetGpuCastFromUint64(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt8(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt16(DataType dst_dtype);
+CastFunctorType GetGpuCastFromInt16(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt32(DataType dst_dtype);
+CastFunctorType GetGpuCastFromInt32(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt64(DataType dst_dtype);
+CastFunctorType GetGpuCastFromInt64(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromHalf(DataType dst_dtype);
+CastFunctorType GetGpuCastFromHalf(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromFloat(DataType dst_dtype);
+CastFunctorType GetGpuCastFromFloat(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromDouble(DataType dst_dtype);
+CastFunctorType GetGpuCastFromDouble(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromComplex64(DataType dst_dtype);
+CastFunctorType GetGpuCastFromComplex64(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromComplex128(DataType dst_dtype);
+CastFunctorType GetGpuCastFromComplex128(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromBfloat(DataType dst_dtype);
+CastFunctorType GetGpuCastFromBfloat(DataType dst_dtype);
#endif // GOOGLE_CUDA
#ifdef TENSORFLOW_USE_SYCL
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromBool(DataType dst_dtype);
+CastFunctorType GetSyclCastFromBool(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint8(DataType dst_dtype);
+CastFunctorType GetSyclCastFromUint8(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint16(DataType dst_dtype);
+CastFunctorType GetSyclCastFromUint16(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint32(DataType dst_dtype);
+CastFunctorType GetSyclCastFromUint32(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint64(DataType dst_dtype);
+CastFunctorType GetSyclCastFromUint64(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt16(DataType dst_dtype);
+CastFunctorType GetSyclCastFromInt16(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt32(DataType dst_dtype);
+CastFunctorType GetSyclCastFromInt32(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt64(DataType dst_dtype);
+CastFunctorType GetSyclCastFromInt64(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromFloat(DataType dst_dtype);
+CastFunctorType GetSyclCastFromFloat(DataType dst_dtype);
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromDouble(DataType dst_dtype);
+CastFunctorType GetSyclCastFromDouble(DataType dst_dtype);
#endif // TENSORFLOW_USE_SYCL
} // namespace tensorflow
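For orientation while reading these hunks: the CastFunctorType alias that replaces the spelled-out std::function signature is declared alongside the cast kernels (cast_op.h in this tree), and the only change to its shape is the trailing truncate flag. A minimal sketch, assuming that location:

typedef std::function<void(OpKernelContext*, const Tensor&, Tensor*,
                           bool trunc)>
    CastFunctorType;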
diff --git a/tensorflow/core/kernels/cast_op_impl_bfloat.cc b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
index bfa7ba0d47..96aae15608 100644
--- a/tensorflow/core/kernels/cast_op_impl_bfloat.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
@@ -22,20 +22,19 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromBfloat(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromBfloat(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, bfloat16);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromBfloat(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromBfloat(DataType dst_dtype) {
if (dst_dtype == DT_FLOAT) {
- return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
+ return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out,
+ bool truncate) {
functor::CastFunctor<GPUDevice, float, bfloat16> func;
func(ctx->eigen_device<GPUDevice>(), out->flat<float>(),
- inp.flat<bfloat16>());
+ inp.flat<bfloat16>(), truncate);
};
}
return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_bool.cc b/tensorflow/core/kernels/cast_op_impl_bool.cc
index c5c7394b43..792d4781f2 100644
--- a/tensorflow/core/kernels/cast_op_impl_bool.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bool.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromBool(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromBool(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, bool);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromBool(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromBool(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, bool);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromBool(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromBool(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromBool(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, bool);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_complex128.cc b/tensorflow/core/kernels/cast_op_impl_complex128.cc
index 52899d58cd..9a184e5954 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex128.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex128.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromComplex128(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromComplex128(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, std::complex<double>);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromComplex128(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromComplex128(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<double>);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_complex64.cc b/tensorflow/core/kernels/cast_op_impl_complex64.cc
index 617bda53d5..77bc620b46 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex64.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromComplex64(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromComplex64(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, std::complex<float>);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromComplex64(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromComplex64(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<float>);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_double.cc b/tensorflow/core/kernels/cast_op_impl_double.cc
index 7dc485ddad..ff9056897f 100644
--- a/tensorflow/core/kernels/cast_op_impl_double.cc
+++ b/tensorflow/core/kernels/cast_op_impl_double.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromDouble(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromDouble(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, double);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromDouble(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromDouble(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, double);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromDouble(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromDouble(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromDouble(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, double);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_float.cc b/tensorflow/core/kernels/cast_op_impl_float.cc
index 1c933914fd..f1e8f0e37b 100644
--- a/tensorflow/core/kernels/cast_op_impl_float.cc
+++ b/tensorflow/core/kernels/cast_op_impl_float.cc
@@ -22,15 +22,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromFloat(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromFloat(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, float);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromFloat(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromFloat(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, GPUDevice, float);
return nullptr;
}
@@ -38,8 +36,7 @@ GetGpuCastFromFloat(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromFloat(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromFloat(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, float);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_half.cc b/tensorflow/core/kernels/cast_op_impl_half.cc
index ef4b94e326..5da3a01352 100644
--- a/tensorflow/core/kernels/cast_op_impl_half.cc
+++ b/tensorflow/core/kernels/cast_op_impl_half.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromHalf(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromHalf(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, Eigen::half);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromHalf(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromHalf(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, Eigen::half);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_int16.cc b/tensorflow/core/kernels/cast_op_impl_int16.cc
index 59360f7445..440ee88fb5 100644
--- a/tensorflow/core/kernels/cast_op_impl_int16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int16.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt16(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromInt16(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, int16);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt16(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromInt16(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int16);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromInt16(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt16(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromInt16(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int16);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc
index a867392fde..4b3e7efddc 100644
--- a/tensorflow/core/kernels/cast_op_impl_int32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int32.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt32(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromInt32(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, int32);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt32(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromInt32(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int32);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromInt32(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt32(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromInt32(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int32);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc
index 467a8f6c89..0f711aa560 100644
--- a/tensorflow/core/kernels/cast_op_impl_int64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int64.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt64(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromInt64(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, int64);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt64(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromInt64(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int64);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromInt64(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt64(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromInt64(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int64);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_int8.cc b/tensorflow/core/kernels/cast_op_impl_int8.cc
index 21002a4321..eac185d5a0 100644
--- a/tensorflow/core/kernels/cast_op_impl_int8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int8.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt8(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromInt8(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, int8);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt8(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromInt8(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int8);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromInt8(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt8(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromInt8(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int8);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_uint16.cc b/tensorflow/core/kernels/cast_op_impl_uint16.cc
index cd829bae2a..3aebbdc1f3 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint16.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint16(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromUint16(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, uint16);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint16(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromUint16(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint16);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromUint16(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint16(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromUint16(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint16);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_uint32.cc b/tensorflow/core/kernels/cast_op_impl_uint32.cc
index d1a854d98b..86f5961bcc 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint32.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint32(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromUint32(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, uint32);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint32(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromUint32(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint32);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromUint32(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint32(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromUint32(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint32);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_uint64.cc b/tensorflow/core/kernels/cast_op_impl_uint64.cc
index 604e0424fc..6478c266ee 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint64.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint64(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromUint64(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, uint64);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint64(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromUint64(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint64);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromUint64(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint64(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromUint64(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint64);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_impl_uint8.cc b/tensorflow/core/kernels/cast_op_impl_uint8.cc
index 2d1a6f3a4e..b22547a23e 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint8.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint8(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromUint8(DataType dst_dtype) {
CURRY_TYPES3(CAST_CASE, CPUDevice, uint8);
return nullptr;
}
#if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint8(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromUint8(DataType dst_dtype) {
CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint8);
return nullptr;
}
@@ -36,8 +34,7 @@ GetGpuCastFromUint8(DataType dst_dtype) {
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint8(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromUint8(DataType dst_dtype) {
CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint8);
return nullptr;
}
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index 75e21802c0..cb305de5e3 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -40,17 +40,27 @@ static Graph* Cast(int num) {
class CastOpTest : public OpsTestBase {
protected:
- void MakeOp(DataType src, DataType dst) {
- TF_EXPECT_OK(NodeDefBuilder("cast_op", "Cast")
- .Input(FakeInput(src))
- .Attr("SrcT", src)
- .Attr("DstT", dst)
- .Finalize(node_def()));
+ void MakeOp(DataType src, DataType dst, bool trunc = false) {
+ if (trunc) {
+ TF_EXPECT_OK(NodeDefBuilder("cast_op", "Cast")
+ .Input(FakeInput(src))
+ .Attr("SrcT", src)
+ .Attr("DstT", dst)
+ .Attr("Truncate", true)
+ .Finalize(node_def()));
+ } else {
+ TF_EXPECT_OK(NodeDefBuilder("cast_op", "Cast")
+ .Input(FakeInput(src))
+ .Attr("SrcT", src)
+ .Attr("DstT", dst)
+ .Finalize(node_def()));
+ }
+
TF_EXPECT_OK(InitOp());
}
template <typename INPUT, typename OUTPUT>
- void CheckCast() {
+ void CheckCast(bool trunc = false) {
DataType in_type = DataTypeToEnum<INPUT>::v();
DataType out_type = DataTypeToEnum<OUTPUT>::v();
-  MakeOp(in_type, out_type);
+  MakeOp(in_type, out_type, trunc);
@@ -64,8 +74,9 @@ class CastOpTest : public OpsTestBase {
}
};
-#define TEST_CAST(in, out) \
- TEST_F(CastOpTest, TestCast##_##in##_##out) { CheckCast<in, out>(); }
+#define TEST_CAST(in, out) \
+ TEST_F(CastOpTest, TestCast##_##in##_##out) { CheckCast<in, out>(); } \
+ TEST_F(CastOpTest, TestCast2##_##in##_##out) { CheckCast<in, out>(true); }
#define TEST_ALL_CASTS_FROM(in) \
TEST_CAST(in, uint8); \
@@ -78,7 +89,12 @@ class CastOpTest : public OpsTestBase {
TEST_CAST(in, half); \
TEST_CAST(in, float); \
TEST_CAST(in, double); \
- TEST_CAST(in, bfloat16);
+ TEST_CAST(in, bfloat16); \
+ TEST_CAST(in, quint8); \
+ TEST_CAST(in, qint8); \
+ TEST_CAST(in, qint32); \
+ TEST_CAST(in, qint16); \
+ TEST_CAST(in, quint16);
TEST_ALL_CASTS_FROM(uint8)
TEST_ALL_CASTS_FROM(uint16)
@@ -91,6 +107,11 @@ TEST_ALL_CASTS_FROM(half)
TEST_ALL_CASTS_FROM(float)
TEST_ALL_CASTS_FROM(double)
TEST_ALL_CASTS_FROM(bfloat16)
+TEST_ALL_CASTS_FROM(quint8)
+TEST_ALL_CASTS_FROM(qint8)
+TEST_ALL_CASTS_FROM(qint32)
+TEST_ALL_CASTS_FROM(qint16)
+TEST_ALL_CASTS_FROM(quint16)
#undef TEST_ALL_CASTS_FROM
#undef TEST_CAST
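Each TEST_CAST invocation above now generates a pair of tests, one per MakeOp branch; for example, TEST_CAST(uint8, float) expands to roughly:

TEST_F(CastOpTest, TestCast_uint8_float) { CheckCast<uint8, float>(); }
TEST_F(CastOpTest, TestCast2_uint8_float) { CheckCast<uint8, float>(true); }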
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index aca75176a5..63b1bcda43 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -909,6 +909,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
dims.in_depth, // in_depths
{{input_desc.height(), // in_rows
input_desc.width()}}, // in_cols
+ FORMAT_NCHW, // compute_data_format
dims.out_depth, // out_depths
{{dims.spatial_dims[0].filter_size, // filter_rows
dims.spatial_dims[1].filter_size, // filter_cols
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 63a775afa8..d664a11e73 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -957,6 +957,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
dims.in_depth, // in_depths
{{input_desc.height(), // in_rows
input_desc.width()}}, // in_cols
+ FORMAT_NCHW, // compute_data_format
dims.out_depth, // out_depths
{{dims.spatial_dims[0].filter_size, // filter_rows
dims.spatial_dims[1].filter_size, // filter_cols
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 980b1063de..15f1bf9aba 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -716,6 +716,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
batch,
in_depth,
{{input_size[0], input_size[1], input_size[2]}},
+ FORMAT_NCHW,
out_depth,
{{filter_size[0], filter_size[1], filter_size[2]}},
{{dilations[0], dilations[1], dilations[2]}},
@@ -1112,6 +1113,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
batch,
in_depth,
{{input_size[0], input_size[1], input_size[2]}},
+ FORMAT_NCHW,
out_depth,
{{filter_size[0], filter_size[1], filter_size[2]}},
{{dilations[0], dilations[1], dilations[2]}},
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 3b9886eece..ef692418d6 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -713,6 +713,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
in_depths, // in_depths
{{in_rows, // in_rows
in_cols}}, // in_cols
+ FORMAT_NCHW, // compute_data_format
out_depths, // out_depths
{{patch_rows, // filter_rows
patch_cols, // filter_cols
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 9ec16be67d..a1eed4e68c 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -415,6 +415,7 @@ struct LaunchConvOp<GPUDevice, T> {
in_batch,
in_depth,
{{in_planes, in_rows, in_cols}},
+ FORMAT_NCHW,
out_depth,
{{filter_planes, filter_rows, filter_cols}},
{{dilations[0], dilations[1], dilations[2]}},
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index d2c8020bb6..afc611f277 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -85,13 +85,15 @@ class ConvParameters {
public:
using SpatialArray = gtl::InlinedVector<int64, 3>;
ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
- int64 out_depths, const SpatialArray& filter,
- const SpatialArray& dilation, const SpatialArray& stride,
- const SpatialArray& padding, DataType dtype, int device_id)
+ TensorFormat data_format, int64 out_depths,
+ const SpatialArray& filter, const SpatialArray& dilation,
+ const SpatialArray& stride, const SpatialArray& padding,
+ DataType dtype, int device_id)
: batch_(batch),
in_depths_(in_depths),
out_depths_(out_depths),
in_(in),
+ data_format_(data_format),
filter_(filter),
dilation_(dilation),
stride_(stride),
@@ -101,6 +103,7 @@ class ConvParameters {
hash_code_ = batch;
hash_code_ = Hash64Combine(hash_code_, in_depths);
for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
+ hash_code_ = Hash64Combine(hash_code_, data_format);
hash_code_ = Hash64Combine(hash_code_, out_depths);
for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
@@ -123,6 +126,7 @@ class ConvParameters {
return strings::StrCat(
batch_, ", ", in_depths_, ", ",
"(", str_util::Join(in_, ", "), "), ",
+ ::tensorflow::ToString(data_format_), ", ",
out_depths_, ", ",
"(", str_util::Join(filter_, ", "), "), ",
"(", str_util::Join(dilation_, ", "), "), ",
@@ -148,12 +152,13 @@ class ConvParameters {
protected:
using ParameterDataType =
- std::tuple<int64, int64, SpatialArray, int64, SpatialArray, SpatialArray,
- SpatialArray, SpatialArray, DataType, int>;
+ std::tuple<int64, int64, SpatialArray, TensorFormat, int64, SpatialArray,
+ SpatialArray, SpatialArray, SpatialArray, DataType, int>;
ParameterDataType get_data_as_tuple() const {
- return std::make_tuple(batch_, in_depths_, in_, out_depths_, filter_,
- dilation_, stride_, padding_, dtype_, device_id_);
+ return std::make_tuple(batch_, in_depths_, in_, data_format_, out_depths_,
+ filter_, dilation_, stride_, padding_, dtype_,
+ device_id_);
}
uint64 hash_code_;
@@ -178,6 +183,7 @@ class ConvParameters {
int64 in_depths_;
int64 out_depths_;
SpatialArray in_;
+ TensorFormat data_format_;
SpatialArray filter_;
SpatialArray dilation_;
SpatialArray stride_;
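The new field has to appear in both the hash and the comparison tuple because two convolutions that differ only in layout would otherwise share an autotune cache entry. A sketch of the collision this prevents, using hypothetical parameter values:

// Hypothetical values; the two calls differ only in the layout argument.
ConvParameters nhwc_params(1, 32, {300, 300}, FORMAT_NHWC, 128, {3, 3},
                           {1, 1}, {1, 1}, {0, 0}, DT_FLOAT, 0);
ConvParameters nchw_params(1, 32, {300, 300}, FORMAT_NCHW, 128, {3, 3},
                           {1, 1}, {1, 1}, {0, 0}, DT_FLOAT, 0);
// With data_format_ folded into hash_code_ and get_data_as_tuple(), these
// now produce distinct cache keys instead of replaying an algorithm
// autotuned for the other layout.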
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index 4f9a96ce17..c281153795 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -44,41 +44,43 @@ struct ConvParametersPeer {
TEST(ConvParameters, WinogradNonfusedAlgoSize) {
ConvParametersPeer conv_params_small = {{
- 1, // batch
- 32, // in_depths
- {{300, // in_rows
- 300}}, // in_cols
- 128, // out_depths
- {{3, // filter_rows
- 3}}, // filter_cols
- {{1, // dilation_rows
- 1}}, // dilation_cols
- {{1, // stride_rows
- 1}}, // stride_cols
- {{0, // padding_rows
- 0}}, // padding_cols
- DT_FLOAT, // tensor datatype
- 0, // device_id
+ 1, // batch
+ 32, // in_depths
+ {{300, // in_rows
+ 300}}, // in_cols
+ FORMAT_NCHW, // compute_data_format
+ 128, // out_depths
+ {{3, // filter_rows
+ 3}}, // filter_cols
+ {{1, // dilation_rows
+ 1}}, // dilation_cols
+ {{1, // stride_rows
+ 1}}, // stride_cols
+ {{0, // padding_rows
+ 0}}, // padding_cols
+ DT_FLOAT, // tensor datatype
+ 0, // device_id
}};
EXPECT_TRUE(
conv_params_small.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
ConvParametersPeer conv_params_large = {{
- 1, // batch
- 128, // in_depths
- {{300, // in_rows
- 300}}, // in_cols
- 768, // out_depths
- {{3, // filter_rows
- 3}}, // filter_cols
- {{1, // dilation_rows
- 1}}, // dilation_cols
- {{1, // stride_rows
- 1}}, // stride_cols
- {{0, // padding_rows
- 0}}, // padding_cols
- DT_FLOAT, // tensor datatype
- 0, // device_id
+ 1, // batch
+ 128, // in_depths
+ {{300, // in_rows
+ 300}}, // in_cols
+ FORMAT_NCHW, // compute_data_format
+ 768, // out_depths
+ {{3, // filter_rows
+ 3}}, // filter_cols
+ {{1, // dilation_rows
+ 1}}, // dilation_cols
+ {{1, // stride_rows
+ 1}}, // stride_cols
+ {{0, // padding_rows
+ 0}}, // padding_cols
+ DT_FLOAT, // tensor datatype
+ 0, // device_id
}};
EXPECT_FALSE(
conv_params_large.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 22524dc1c4..99d01b4db6 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -28,7 +28,6 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/crop_resize_bilinear_core.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
@@ -43,10 +42,6 @@ limitations under the License.
using stream_executor::cuda::ScopedActivateExecutorContext;
#endif // GOOGLE_CUDA
-using ::tensorflow::internal::CachedInterpolation;
-using ::tensorflow::internal::compute_interpolation_weights;
-using ::tensorflow::internal::crop_resize_single_image;
-
namespace tensorflow {
namespace {
@@ -254,34 +249,39 @@ struct CropAndResize<CPUDevice, T> {
continue;
}
if (method_name == "bilinear") {
- CachedInterpolation *interp_x = nullptr, *interp_y = nullptr;
- int min_ix, max_ix, min_iy, max_iy;
- compute_interpolation_weights(crop_width, image_width, x1, x2,
- min_ix, max_ix, interp_x);
- compute_interpolation_weights(crop_height, image_height, y1, y2,
- min_iy, max_iy, interp_y);
-
- // multiply by depth to avoid multiplication in resize_single_image.
- for (int i = min_ix; i <= max_ix; ++i) {
- interp_x[i - min_ix].lower *= depth;
- interp_x[i - min_ix].upper *= depth;
- }
+ const int top_y_index = floorf(in_y);
+ const int bottom_y_index = ceilf(in_y);
+ const float y_lerp = in_y - top_y_index;
+
+ for (int x = 0; x < crop_width; ++x) {
+ const float in_x = (crop_width > 1)
+ ? x1 * (image_width - 1) + x * width_scale
+ : 0.5 * (x1 + x2) * (image_width - 1);
+ if (in_x < 0 || in_x > image_width - 1) {
+ for (int d = 0; d < depth; ++d) {
+ crops(b, y, x, d) = extrapolation_value;
+ }
+ continue;
+ }
+ const int left_x_index = floorf(in_x);
+ const int right_x_index = ceilf(in_x);
+ const float x_lerp = in_x - left_x_index;
- crop_resize_single_image<T, float>(
- image.data() + static_cast<int64>(b_in) *
- static_cast<int64>(image_height) *
- static_cast<int64>(image_width) *
- static_cast<int64>(depth),
- image_height, image_width, crop_height, crop_width, depth,
- min_ix, max_ix, interp_x, min_iy, max_iy, interp_y,
- extrapolation_value, false, false,
- crops.data() + static_cast<int64>(b) *
- static_cast<int64>(crop_height) *
- static_cast<int64>(crop_width) *
- static_cast<int64>(depth));
-
- delete[] interp_y;
- delete[] interp_x;
+ for (int d = 0; d < depth; ++d) {
+ const float top_left(static_cast<float>(
+ image(b_in, top_y_index, left_x_index, d)));
+ const float top_right(static_cast<float>(
+ image(b_in, top_y_index, right_x_index, d)));
+ const float bottom_left(static_cast<float>(
+ image(b_in, bottom_y_index, left_x_index, d)));
+ const float bottom_right(static_cast<float>(
+ image(b_in, bottom_y_index, right_x_index, d)));
+ const float top = top_left + (top_right - top_left) * x_lerp;
+ const float bottom =
+ bottom_left + (bottom_right - bottom_left) * x_lerp;
+ crops(b, y, x, d) = top + (bottom - top) * y_lerp;
+ }
+ }
} else { // method == "nearest"
for (int x = 0; x < crop_width; ++x) {
const float in_x = (crop_width > 1)
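The arithmetic in the restored bilinear branch is the standard two-stage linear interpolation; as a self-contained reference (it mirrors the compute_lerp helper in the header deleted below):

static inline float BilinearBlend(float top_left, float top_right,
                                  float bottom_left, float bottom_right,
                                  float x_lerp, float y_lerp) {
  const float top = top_left + (top_right - top_left) * x_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
  return top + (bottom - top) * y_lerp;
}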
diff --git a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h
deleted file mode 100644
index 51327aca67..0000000000
--- a/tensorflow/core/kernels/crop_resize_bilinear_core.h
+++ /dev/null
@@ -1,464 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_
-#define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_
-
-namespace tensorflow {
-namespace internal {
-// Compute the interpolation indices only once.
-struct CachedInterpolation {
- int lower; // Lower source index used in the interpolation
- int upper; // Upper source index used in the interpolation
- // 1-D linear interpolation scale (see:
- // https://en.wikipedia.org/wiki/Bilinear_interpolation)
- float lerp;
-};
-
-inline bool compute_single_interpolation_weight(
- const int in_size, const float out2in_scale, const float out2in_start,
- const bool clip, const int i, int* lower, int* upper, float* lerp) {
- const float in = i * out2in_scale + out2in_start;
- *lower = (int)floor(in);
- *upper = (int)ceil(in);
- *lerp = (float)(in - (float)*lower);
- if (clip) {
- if (*lower < 0)
- *lower = 0;
- else if (*lower >= in_size)
- *lower = in_size - 1;
- if (*upper < 0)
- *upper = 0;
- else if (*upper >= in_size)
- *upper = in_size - 1;
- return true;
- } else {
- return (*lower >= 0 && *upper < in_size) ? true : false;
- }
-}
-/**
- * Compute interpolation values for output indexes in range [min_i, max_i].
- * Returns true if all output indexes have lower and upper (input) indexes
- * within range [0,in_size-1].
- */
-inline bool compute_interpolation_weights(const int min_i, const int max_i,
- const int in_size,
- const float out2in_scale,
- const float out2in_start,
- const bool clip,
- CachedInterpolation* interpolation) {
- bool rval = true;
- int num_i = max_i - min_i + 1;
- for (int i = 0; i < num_i; ++i) {
- if (!compute_single_interpolation_weight(
- in_size, out2in_scale, out2in_start, clip, i + min_i,
- &interpolation[i].lower, &interpolation[i].upper,
- &interpolation[i].lerp)) {
- rval = false;
- }
- }
- return rval;
-}
-/**
- * Compatibility method for resize_bilinear_op.cc
- */
-inline void compute_interpolation_weights(const int out_size, const int in_size,
- const float out2in_scale,
- CachedInterpolation* interpolation) {
- interpolation[out_size].lower = 0;
- interpolation[out_size].upper = 0;
- const bool clip = true;
- if (!compute_interpolation_weights(0, out_size - 1, in_size, out2in_scale,
- 0.0f, clip, interpolation)) {
- // Should never happen, check for it anyway
- printf(
- "Warning! Interpolation values have lower,upper indexes outside of "
- "range [0,in_size-1]\n");
- }
-}
-/**
- * Compute the minimum and maximum (output) i for which both the lower and
- * upper (input) indexes are in range [0,in_size-1].
- * If no value of i satisfies the condition, min_i = out_size, max_i = -1 and
- * the method returns false.
- * Returns true if min_i <= max_i.
- */
-inline bool compute_minmax_indexes(const int out_size, const int in_size,
- const float out2in_scale,
- const float out2in_start, int& min_i,
- int& max_i) {
- min_i = out_size;
- max_i = -1;
- int lower, upper;
- float lerp;
- for (int i = 0; i < out_size; ++i) {
- if (compute_single_interpolation_weight(in_size, out2in_scale, out2in_start,
- false, i, &lower, &upper, &lerp)) {
- if (i < min_i) min_i = i;
- if (i > max_i) max_i = i;
- }
- }
- return (min_i <= max_i) ? true : false;
-}
-/**
- * Compute interpolation weights for crop_and_resize_op.cc
- * Also computes extrapolation areas.
- * Returns true if at least one point requires interpolation, false otherwise.
- */
-inline bool compute_interpolation_weights(
- const int out_size, const int in_size,
- const float x1, // lower bounding box, crop region starts at in_size*x1
- const float x2, // upper bounding box, crop region ends at in_size*x2
- int& min_i, int& max_i, CachedInterpolation*& interpolation) {
- float out2in_start = out_size > 1
- ? (float)(in_size - 1) * (float)x1
- : (float)(in_size - 1) * (float)(x1 + x2) / 2.0f;
- float out2in_scale = out_size > 1 ? (float)(x2 - x1) * (float)(in_size - 1) /
- (float)(out_size - 1)
- : 0.0f;
- if (compute_minmax_indexes(out_size, in_size, out2in_scale, out2in_start,
- min_i, max_i)) {
- interpolation = new CachedInterpolation[max_i - min_i + 1];
- bool all_inputs_ok =
- compute_interpolation_weights(min_i, max_i, in_size, out2in_scale,
- out2in_start, false, interpolation);
- if (!all_inputs_ok) {
- // should never happen, purpose of compute_minmax_indexes is to ensure
- // that all inputs are ok.
- printf(
- "Error! compute_interpolation_weights returned input indexes outside "
- "valid range - SEGV will likely ensue.\n");
- }
- return true;
- } else {
- interpolation = 0l;
- return false;
- }
-}
-
-/**
- * Cast float v to type U with range clamping.
- *
- * If v<min_val, return value is clamped to u_min_val. similarly if v>max_val,
- * return value is clamped to u_max_val.
- */
-template <typename U>
-U cast_to(float v, float min_val, float max_val, U u_min_val, U u_max_val);
-template <typename U>
-inline U cast_to(float v, float min_val, float max_val, U u_min_val,
- U u_max_val) {
- if (v < min_val)
- return u_min_val;
- else if (v > max_val)
- return u_max_val;
- else
- return static_cast<U>(v);
-}
-template <>
-inline float cast_to<float>(float v, float min_val, float max_val,
- float u_min_val, float u_max_val) {
- return v;
-}
-
-inline float compute_lerp(const float top_left, const float top_right,
- const float bottom_left, const float bottom_right,
- const float x_lerp, const float y_lerp) {
- const float top = top_left + (top_right - top_left) * x_lerp;
- const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
- return top + (bottom - top) * y_lerp;
-}
-
-/**
- * Computes the bilinear interpolation from the appropriate 4 float points
- * and the linear interpolation weights.
- * Accepts input tensors of type T and produces output tensors of type U.
- * Optionally flips horizontal and/or vertical axis.
- */
-template <typename T, typename U>
-void crop_resize_single_image(const T* image, const int64 in_height,
- const int64 in_width, const int64 out_height,
- const int64 out_width, const int channels,
- const int min_ix, const int max_ix,
- const CachedInterpolation* xs, const int min_iy,
- const int max_iy, const CachedInterpolation* ys,
- const float extrapolated_value, const bool flip_x,
- const bool flip_y,
- U* output) TF_ATTRIBUTE_NOINLINE;
-template <typename T, typename U>
-void crop_resize_single_image(const T* image, const int64 in_height,
- const int64 in_width, const int64 out_height,
- const int64 out_width, const int channels,
- const int min_ix, const int max_ix,
- const CachedInterpolation* xs, const int min_iy,
- const int max_iy, const CachedInterpolation* ys,
- const float extrapolated_value, const bool flip_x,
- const bool flip_y, U* output) {
- const int64 in_row_size = in_width * channels;
- const int64 out_row_size = out_width * channels;
- U u_min_val = std::numeric_limits<U>::min();
- U u_max_val = std::numeric_limits<U>::max();
- float min_val = static_cast<float>(u_min_val);
- float max_val = static_cast<float>(u_max_val);
- U uEx =
- cast_to<U>(extrapolated_value, min_val, max_val, u_min_val, u_max_val);
- // low y extrapolation zone
- if (min_iy > 0) {
- U* p = flip_y ? output + out_row_size * (out_height - min_iy) : output;
- int64 nn = out_row_size * (int64)min_iy;
- for (int64 i = 0; i < nn; ++i) p[i] = uEx;
- }
- // high y extrapolation zone
- if (max_iy < out_height - 1) {
- U* p = flip_y ? output : output + out_row_size * (max_iy + 1);
- int64 nn = out_row_size * (int64)(out_height - 1 - max_iy);
- for (int64 i = 0; i < nn; ++i) p[i] = uEx;
- }
- // low x extrapolation zone
- if (min_ix > 0) {
- for (int iy = min_iy; iy <= max_iy; ++iy) {
- int xx0 = flip_x ? (out_width - min_ix) * channels : 0;
- int nxx = min_ix * channels;
- U* p = output + xx0 +
- out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy);
- for (int ix = 0; ix < nxx; ++ix) {
- p[ix] = uEx;
- }
- }
- }
- // high x extrapolation zone
- if (max_ix < out_width - 1) {
- for (int iy = min_iy; iy <= max_iy; ++iy) {
- int xx0 = flip_x ? 0 : (max_ix + 1) * channels;
- int nxx = (out_width - 1 - max_ix) * channels;
- U* p = output + xx0 +
- out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy);
- for (int ix = 0; ix < nxx; ++ix) {
- p[ix] = uEx;
- }
- }
- }
- U* output_y_ptr =
- output +
- out_row_size * (int64)(flip_y ? out_height - 1 - min_iy : min_iy);
- // interpolation zone
- if (channels == 1) {
- for (int y = min_iy; y <= max_iy; ++y) {
- const int iy = y - min_iy;
- const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
- const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
- const float ys_lerp = ys[iy].lerp;
- const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
- const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
- for (int x = x0; x <= x1; ++x) {
- const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
- const int64 xs_lower = xs[ix].lower;
- const int64 xs_upper = xs[ix].upper;
- const float xs_lerp = xs[ix].lerp;
-
- // Read channel 0.
- const float top_left0(ys_input_lower_ptr[xs_lower]);
- const float top_right0(ys_input_lower_ptr[xs_upper]);
- const float bottom_left0(ys_input_upper_ptr[xs_lower]);
- const float bottom_right0(ys_input_upper_ptr[xs_upper]);
-
- // Compute output.
- float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
- bottom_right0, xs_lerp, ys_lerp);
- output_y_ptr[x] =
- cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
- }
- output_y_ptr =
- flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
- }
- } else if (channels == 2) {
- for (int y = min_iy; y <= max_iy; ++y) {
- const int iy = y - min_iy;
- const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
- const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
- const float ys_lerp = ys[iy].lerp;
- const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
- const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
- for (int x = x0; x <= x1; ++x) {
- const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
- const int64 xs_lower = xs[ix].lower;
- const int64 xs_upper = xs[ix].upper;
- const float xs_lerp = xs[ix].lerp;
-
- // Read channel 0.
- const float top_left0(ys_input_lower_ptr[xs_lower + 0]);
- const float top_right0(ys_input_lower_ptr[xs_upper + 0]);
- const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]);
- const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]);
-
- // Read channel 1.
- const float top_left1(ys_input_lower_ptr[xs_lower + 1]);
- const float top_right1(ys_input_lower_ptr[xs_upper + 1]);
- const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]);
- const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]);
-
- // Compute output.
- float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
- bottom_right0, xs_lerp, ys_lerp);
- float result1 = compute_lerp(top_left1, top_right1, bottom_left1,
- bottom_right1, xs_lerp, ys_lerp);
- output_y_ptr[x * 2 + 0] =
- cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
- output_y_ptr[x * 2 + 1] =
- cast_to<U>(result1, min_val, max_val, u_min_val, u_max_val);
- }
- output_y_ptr =
- flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
- }
- } else if (channels == 3) {
- for (int y = min_iy; y <= max_iy; ++y) {
- const int iy = y - min_iy;
- const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
- const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
- const float ys_lerp = ys[iy].lerp;
- const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
- const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
- for (int x = x0; x <= x1; ++x) {
- const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
- const int64 xs_lower = xs[ix].lower;
- const int64 xs_upper = xs[ix].upper;
- const float xs_lerp = xs[ix].lerp;
-
- // Read channel 0.
- const float top_left0(ys_input_lower_ptr[xs_lower + 0]);
- const float top_right0(ys_input_lower_ptr[xs_upper + 0]);
- const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]);
- const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]);
-
- // Read channel 1.
- const float top_left1(ys_input_lower_ptr[xs_lower + 1]);
- const float top_right1(ys_input_lower_ptr[xs_upper + 1]);
- const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]);
- const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]);
-
- // Read channel 2.
- const float top_left2(ys_input_lower_ptr[xs_lower + 2]);
- const float top_right2(ys_input_lower_ptr[xs_upper + 2]);
- const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]);
- const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]);
-
- // Compute output.
- float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
- bottom_right0, xs_lerp, ys_lerp);
- float result1 = compute_lerp(top_left1, top_right1, bottom_left1,
- bottom_right1, xs_lerp, ys_lerp);
- float result2 = compute_lerp(top_left2, top_right2, bottom_left2,
- bottom_right2, xs_lerp, ys_lerp);
- output_y_ptr[x * 3 + 0] =
- cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
- output_y_ptr[x * 3 + 1] =
- cast_to<U>(result1, min_val, max_val, u_min_val, u_max_val);
- output_y_ptr[x * 3 + 2] =
- cast_to<U>(result2, min_val, max_val, u_min_val, u_max_val);
- }
- output_y_ptr =
- flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
- }
- } else if (channels == 4) {
- for (int y = min_iy; y <= max_iy; ++y) {
- const int iy = y - min_iy;
- const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
- const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
- const float ys_lerp = ys[iy].lerp;
- const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
- const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
- for (int x = x0; x <= x1; ++x) {
- const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
- const int64 xs_lower = xs[ix].lower;
- const int64 xs_upper = xs[ix].upper;
- const float xs_lerp = xs[ix].lerp;
-
- // Read channel 0.
- const float top_left0(ys_input_lower_ptr[xs_lower + 0]);
- const float top_right0(ys_input_lower_ptr[xs_upper + 0]);
- const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]);
- const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]);
-
- // Read channel 1.
- const float top_left1(ys_input_lower_ptr[xs_lower + 1]);
- const float top_right1(ys_input_lower_ptr[xs_upper + 1]);
- const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]);
- const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]);
-
- // Read channel 2.
- const float top_left2(ys_input_lower_ptr[xs_lower + 2]);
- const float top_right2(ys_input_lower_ptr[xs_upper + 2]);
- const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]);
- const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]);
-
- // Read channel 3.
- const float top_left3(ys_input_lower_ptr[xs_lower + 3]);
- const float top_right3(ys_input_lower_ptr[xs_upper + 3]);
- const float bottom_left3(ys_input_upper_ptr[xs_lower + 3]);
- const float bottom_right3(ys_input_upper_ptr[xs_upper + 3]);
-
- // Compute output.
- float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
- bottom_right0, xs_lerp, ys_lerp);
- float result1 = compute_lerp(top_left1, top_right1, bottom_left1,
- bottom_right1, xs_lerp, ys_lerp);
- float result2 = compute_lerp(top_left2, top_right2, bottom_left2,
- bottom_right2, xs_lerp, ys_lerp);
- float result3 = compute_lerp(top_left3, top_right3, bottom_left3,
- bottom_right3, xs_lerp, ys_lerp);
- output_y_ptr[x * 4 + 0] =
- cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
- output_y_ptr[x * 4 + 1] =
- cast_to<U>(result1, min_val, max_val, u_min_val, u_max_val);
- output_y_ptr[x * 4 + 2] =
- cast_to<U>(result2, min_val, max_val, u_min_val, u_max_val);
- output_y_ptr[x * 4 + 3] =
- cast_to<U>(result3, min_val, max_val, u_min_val, u_max_val);
- }
- output_y_ptr =
- flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
- }
- } else {
- for (int y = min_iy; y <= max_iy; ++y) {
- const int iy = y - min_iy;
- const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
- const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
- const float ys_lerp = ys[iy].lerp;
- const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
- const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
- for (int x = x0; x <= x1; ++x) {
- const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
- const int64 xs_lower = xs[ix].lower;
- const int64 xs_upper = xs[ix].upper;
- const float xs_lerp = xs[ix].lerp;
- for (int ichan = 0; ichan < channels; ++ichan) {
- const float top_left0(ys_input_lower_ptr[xs_lower + ichan]);
- const float top_right0(ys_input_lower_ptr[xs_upper + ichan]);
- const float bottom_left0(ys_input_upper_ptr[xs_lower + ichan]);
- const float bottom_right0(ys_input_upper_ptr[xs_upper + ichan]);
- float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
- bottom_right0, xs_lerp, ys_lerp);
- output_y_ptr[x * channels + ichan] =
- cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
- }
- }
- output_y_ptr =
- flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
- }
- }
-}
-} // namespace internal
-} // namespace tensorflow
-#endif // TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index c1a25767d3..90762fb1b0 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -16,7 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
-REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double);
+REGISTER4(UnaryOp, CPU, "Tan", functor::tan, float, double, complex64,
+ complex128);
#if GOOGLE_CUDA
REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double);
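The CPU kernel defers to Eigen's scalar tangent, which for complex arguments is expected to agree with std::tan; a standalone sanity check of the value the new complex64/complex128 registrations should produce:

#include <complex>
#include <cstdio>

int main() {
  const std::complex<double> z(1.0, 1.0);
  const std::complex<double> t = std::tan(z);
  // tan(1+1i) is approximately 0.271753 + 1.083923i.
  std::printf("tan(1+1i) = %f + %fi\n", t.real(), t.imag());
  return 0;
}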
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
index 754c32b6ca..58ec3d4495 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -390,7 +390,7 @@ class FeatureStatsDatasetOp : public UnaryDatasetOpKernel {
for (const auto& feature_list :
example.feature_lists().feature_list()) {
- stats_aggregator->IncrementCounter("feature_lists_count", "reainer",
+ stats_aggregator->IncrementCounter("feature_lists_count", "trainer",
1);
for (const auto& feature : feature_list.second.feature()) {
feature_values_list_size_sum += AddStatsFeatureValues(feature);
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 519c475332..cb285bf732 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -536,6 +536,7 @@ class FakeParamOp : public OpKernel {
};
REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_CPU), FakeParamOp);
+REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_GPU), FakeParamOp);
} // namespace
} // namespace tensorflow
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index f99dd643f7..d89f1592bd 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -45,6 +45,24 @@ struct FusedBatchNorm;
template <typename Device, typename T, typename U>
struct FusedBatchNormGrad;
+template <bool IsSame, typename Y, typename X, typename T>
+struct CastIfNecessary {
+ static inline void process(
+ Y& y, X& x_shifted, const Eigen::DSizes<Eigen::Index, 2>& rest_by_depth,
+ const CPUDevice& d) {
+ y.reshape(rest_by_depth).device(d) = x_shifted.template cast<T>();
+ }
+};
+
+template <typename Y, typename X, typename T>
+struct CastIfNecessary<true, Y, X, T> {
+ static inline void process(
+ Y& y, X& x_shifted, const Eigen::DSizes<Eigen::Index, 2>& rest_by_depth,
+ const CPUDevice& d) {
+ y.reshape(rest_by_depth).device(d) = x_shifted;
+ }
+};
+
template <typename T, typename U>
struct FusedBatchNorm<CPUDevice, T, U> {
void operator()(OpKernelContext* context, const Tensor& x_input,
@@ -125,7 +143,11 @@ struct FusedBatchNorm<CPUDevice, T, U> {
auto x_shifted =
x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec);
- y.reshape(rest_by_depth).device(d) = x_shifted.template cast<T>();
+ // Explicitly checks the types of T and U and only casts x_shifted when
+ // T != U. (Not doing so caused a 35-50% performance slowdown for
+ // some compiler flags.)
+ CastIfNecessary<std::is_same<T, U>::value, decltype(y), decltype(x_shifted),
+ T>::process(y, x_shifted, rest_by_depth, d);
}
};
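CastIfNecessary above is ordinary compile-time dispatch on a bool template parameter, so the T == U instantiation never emits a cast. A stripped-down, Eigen-free sketch of the same pattern:

#include <type_traits>

// Primary template: destination and source types differ, so convert.
template <bool kSameType>
struct CopyOrCast {
  template <typename To, typename From>
  static void Run(To* to, const From& from) {
    *to = static_cast<To>(from);
  }
};

// Specialization for To == From: plain assignment, no conversion emitted.
template <>
struct CopyOrCast<true> {
  template <typename To, typename From>
  static void Run(To* to, const From& from) {
    *to = from;
  }
};

template <typename To, typename From>
void Assign(To* to, const From& from) {
  CopyOrCast<std::is_same<To, From>::value>::Run(to, from);
}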
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index 4e53291b7f..e50b7fe3bf 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -188,12 +188,13 @@ Status DoGatherNd(OpKernelContext* c, const Tensor& params,
// bad_i will only return >= 0 on CPUs right now.
if (bad_i >= 0) {
+ auto shape = indices.shape();
+ shape.RemoveLastDims(1);
return errors::InvalidArgument(
- "flat indices[", bad_i, ", :] = [",
+ "indices", SliceDebugString(shape, bad_i), " = [",
str_util::Join(
gtl::ArraySlice<Index>(&indices_mat(bad_i, 0), indices_nd), ", "),
- "] does not index into param (shape: ", params.shape().DebugString(),
- ").");
+ "] does not index into param shape ", params.shape().DebugString());
}
}
return Status::OK();
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index b596dbc782..80376c61aa 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -453,10 +453,14 @@ class MatMulOp : public OpKernel {
const Tensor& b = ctx->input(1);
// Check that the dimensions of the two matrices are valid.
- OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
- errors::InvalidArgument("In[0] is not a matrix"));
- OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
- errors::InvalidArgument("In[1] is not a matrix"));
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsMatrix(a.shape()),
+ errors::InvalidArgument("In[0] is not a matrix. Instead it has shape ",
+ a.shape().DebugString()));
+ OP_REQUIRES(
+ ctx, TensorShapeUtils::IsMatrix(b.shape()),
+ errors::InvalidArgument("In[1] is not a matrix. Instead it has shape ",
+ b.shape().DebugString()));
Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
dim_pair[0].first = transpose_a_ ? 0 : 1;
dim_pair[0].second = transpose_b_ ? 1 : 0;
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 6f490cdc23..d8efb1be3e 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -308,11 +308,9 @@ class MklConcatOp : public OpKernel {
}
if (invoke_eigen) {
- string msg = std::string("Invoking Eigen version of Concat. Reason:") +
- (!is_concat_dim_channel
- ? std::string("Concat dimension is not channel")
- : std::string("Not all tensors are in Mkl layout"));
- VLOG(1) << "_MklConcatOp: " << msg;
+ VLOG(1) << "_MklConcatOp: Invoking Eigen version of Concat. Reason:"
+ << (!is_concat_dim_channel ? "Concat dimension is not channel"
+ : "Not all tensors are in Mkl layout");
CallEigenVersion(context, input_tensors, input_shapes);
return;
}
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 4e80f5acce..b73a119a88 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -328,9 +328,8 @@ class MklConv2DBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
return instance_;
}
- static std::string CreateKey(
- const MklConvBwdFilterParams& convBwdFilterDims) {
- std::string prefix = "conv2d_bwd_filter";
+ static string CreateKey(const MklConvBwdFilterParams& convBwdFilterDims) {
+ string prefix = "conv2d_bwd_filter";
FactoryKeyCreator key_creator;
key_creator.AddAsKey(prefix);
key_creator.AddAsKey(convBwdFilterDims.src_dims);
@@ -346,13 +345,13 @@ class MklConv2DBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
MklPrimitive* GetConv2dBwdFilter(
const MklConvBwdFilterParams& convBwdFilterDims) {
- std::string key = CreateKey(convBwdFilterDims);
+ string key = CreateKey(convBwdFilterDims);
return this->GetOp(key);
}
void SetConv2dBwdFilter(
const MklConvBwdFilterParams& convBwdFilterDims, MklPrimitive* op) {
- std::string key = CreateKey(convBwdFilterDims);
+ string key = CreateKey(convBwdFilterDims);
this->SetOp(key, op);
}
};
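
The `CreateKey`/`FactoryKeyCreator` pattern above serializes a primitive's parameters into a string key for the thread-local primitive cache. A rough standalone sketch of the idea, with a hypothetical delimiter and `AddAsKey` overloads (the real class lives in mkl_util.h):

```cpp
#include <iostream>
#include <string>
#include <vector>

class FactoryKeyCreatorSketch {
 public:
  void AddAsKey(const std::string& s) { key_ += s + delimiter_; }
  void AddAsKey(const std::vector<int>& dims) {
    for (int d : dims) key_ += std::to_string(d) + "x";  // encode each dim
    key_ += delimiter_;
  }
  const std::string& GetKey() const { return key_; }

 private:
  std::string key_;
  const std::string delimiter_ = ":";
};

int main() {
  FactoryKeyCreatorSketch kc;
  kc.AddAsKey("conv2d_bwd_filter");
  kc.AddAsKey({1, 28, 28, 3});  // e.g. src dims
  std::cout << kc.GetKey() << "\n";  // conv2d_bwd_filter:1x28x28x3x:
}
```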
@@ -872,12 +871,10 @@ class MklConv2DCustomBackpropFilterOp
}
// check if src and diff_dst need reorder
- std::vector<primitive> net;
T *src_data = nullptr;
if (fwd_src_md.data.format != conv2d_bwd_filter->GetSrcMemoryFormat()) {
src.SetUsrMem(fwd_src_md, &src_tensor);
- src.CheckReorderToOpMem(
- bwd_filter_pd->src_primitive_desc(), &net);
+ src.CheckReorderToOpMem(bwd_filter_pd->src_primitive_desc());
src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
} else {
src_data = static_cast<T*>(const_cast<T*>(
@@ -888,15 +885,13 @@ class MklConv2DCustomBackpropFilterOp
if (diff_dst_md.data.format !=
conv2d_bwd_filter->GetDiffDstMemoryFormat()) {
diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
- diff_dst.CheckReorderToOpMem(
- bwd_filter_pd->diff_dst_primitive_desc(), &net);
+ diff_dst.CheckReorderToOpMem(bwd_filter_pd->diff_dst_primitive_desc());
diff_dst_data = static_cast<T*>(
diff_dst.GetOpMem().get_data_handle());
} else {
diff_dst_data = static_cast<T*>(const_cast<T*>(
diff_dst_tensor.flat<T>().data()));
}
- stream(stream::kind::eager).submit(net).wait();
// For backward filter, convert diff_filter back to Tensorflow layout
// Here we prepare to reorder op memory back to user memory
@@ -929,9 +924,7 @@ class MklConv2DCustomBackpropFilterOp
// Reorder diff_filter back to Tensorflow layout if necessary
if (diff_filter_reorder_required) {
- std::vector<primitive> net;
- diff_filter.InsertReorderToUserMem(&net);
- stream(stream::kind::eager).submit(net).wait();
+ diff_filter.InsertReorderToUserMem();
}
} catch (mkldnn::error& e) {
string error_msg = "Status: " + std::to_string(e.status) +
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 0af4568b47..39498f1a80 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -265,9 +265,8 @@ class MklConv2DBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
return instance_;
}
- static std::string CreateKey(
- const MklConvBwdInputParams& convBwdInputDims) {
- std::string prefix = "conv2d_bwd_input";
+ static string CreateKey(const MklConvBwdInputParams& convBwdInputDims) {
+ string prefix = "conv2d_bwd_input";
FactoryKeyCreator key_creator;
key_creator.AddAsKey(prefix);
key_creator.AddAsKey(convBwdInputDims.diff_src_dims);
@@ -282,13 +281,13 @@ class MklConv2DBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
MklPrimitive* GetConv2dBwdInput(
const MklConvBwdInputParams& convBwdInputDims) {
- std::string key = CreateKey(convBwdInputDims);
+ string key = CreateKey(convBwdInputDims);
return this->GetOp(key);
}
void SetConv2dBwdInput(
const MklConvBwdInputParams& convBwdInputDims, MklPrimitive *op) {
- std::string key = CreateKey(convBwdInputDims);
+ string key = CreateKey(convBwdInputDims);
this->SetOp(key, op);
}
};
@@ -722,14 +721,11 @@ class MklConv2DCustomBackpropInputOp
diff_src_tensor->flat<T>().data()));
// check if filter and diff_dst need reorder
- std::vector<primitive> net;
T* filter_data = nullptr;
if (fwd_filter_md.data.format !=
conv2d_bwd_input->GetFilterMemoryFormat()) {
filter.SetUsrMem(fwd_filter_md, &filter_tensor);
- filter.CheckReorderToOpMem(
- bwd_input_pd->weights_primitive_desc(),
- &net);
+ filter.CheckReorderToOpMem(bwd_input_pd->weights_primitive_desc());
filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle());
} else {
filter_data = static_cast<T*>(const_cast<T*>(
@@ -740,15 +736,13 @@ class MklConv2DCustomBackpropInputOp
if (diff_dst_md.data.format !=
conv2d_bwd_input->GetDiffDstMemoryFormat()) {
diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
- diff_dst.CheckReorderToOpMem(
- bwd_input_pd->diff_dst_primitive_desc(), &net);
+ diff_dst.CheckReorderToOpMem(bwd_input_pd->diff_dst_primitive_desc());
diff_dst_data = static_cast<T*>(
diff_dst.GetOpMem().get_data_handle());
} else {
diff_dst_data = static_cast<T*>(const_cast<T*>(
diff_dst_tensor.flat<T>().data()));
}
- stream(stream::kind::eager).submit(net).wait();
// execute convolution input bwd
conv2d_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data);
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index b568973220..62396eeb8b 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -18,7 +18,6 @@ limitations under the License.
#include <string.h>
#include <map>
-#include <string>
#include <vector>
#include <memory>
@@ -35,6 +34,7 @@ limitations under the License.
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/util/padding.h"
@@ -298,8 +298,8 @@ class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
return instance_;
}
- static std::string CreateKey(const MklConvFwdParams& convFwdDims) {
- std::string prefix = "conv2d_fwd_";
+ static string CreateKey(const MklConvFwdParams& convFwdDims) {
+ string prefix = "conv2d_fwd_";
FactoryKeyCreator key_creator;
key_creator.AddAsKey(prefix);
key_creator.AddAsKey(convFwdDims.src_dims);
@@ -314,12 +314,12 @@ class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
}
MklPrimitive* GetConv2DFwd(const MklConvFwdParams& convFwdDims) {
- std::string key = CreateKey(convFwdDims);
+ string key = CreateKey(convFwdDims);
return this->GetOp(key);
}
void SetConv2DFwd(const MklConvFwdParams& convFwdDims, MklPrimitive* op) {
- std::string key = CreateKey(convFwdDims);
+ string key = CreateKey(convFwdDims);
this->SetOp(key, op);
}
};
@@ -930,10 +930,9 @@ class MklConv2DOp : public OpKernel {
conv2d_fwd->Execute(src_data, filter_data, dst_data);
}
} catch (mkldnn::error &e) {
- string error_msg = "Status: " + std::to_string(e.status) +
- ", message: " + std::string(e.message) +
- ", in file " + std::string(__FILE__) + ":" +
- std::to_string(__LINE__);
+ string error_msg = tensorflow::strings::StrCat(
+ "Status: ", e.status, ", message: ", string(e.message), ", in file ",
+ __FILE__, ":", __LINE__);
OP_REQUIRES_OK(context,
errors::Aborted("Operation received an exception:", error_msg));
}
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 5e1a5001dc..3f154ff33b 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -17,7 +17,6 @@ limitations under the License.
#define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
#include <limits>
-#include <string>
#include <vector>
#include <memory>
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index c0dfed7d7d..cb1eecb36a 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -17,7 +17,6 @@ limitations under the License.
#define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
#ifdef INTEL_MKL
-#include <string>
#include <vector>
#include "tensorflow/core/util/mkl_util.h"
#include "tensorflow/core/util/padding.h"
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index b5c6ba1da3..a7a9609c21 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -330,6 +330,7 @@ class PartitionedCallOp : public AsyncOpKernel {
// using device-specific threadpools when available.
opts.runner = ctx->runner();
opts.source_device = local_device_name_;
+ opts.allow_dead_tensors = true;
// TODO(akshayka): Accommodate the multiple-worker scenario by adding the
// constructed rendezvous to a rendezvous manager.
Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
index bf9e93ef3f..dde59e8e74 100644
--- a/tensorflow/core/kernels/resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -25,15 +25,10 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/crop_resize_bilinear_core.h"
#include "tensorflow/core/kernels/image_resizer_state.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
-using ::tensorflow::internal::CachedInterpolation;
-using ::tensorflow::internal::compute_interpolation_weights;
-using ::tensorflow::internal::crop_resize_single_image;
-
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -68,6 +63,140 @@ class ResizeBilinearOp : public OpKernel {
bool align_corners_;
};
+namespace {
+// Compute the interpolation indices only once.
+struct CachedInterpolation {
+ int64 lower; // Lower source index used in the interpolation
+ int64 upper; // Upper source index used in the interpolation
+ // 1-D linear interpolation scale (see:
+ // https://en.wikipedia.org/wiki/Bilinear_interpolation)
+ float lerp;
+};
+
+inline void compute_interpolation_weights(const int64 out_size,
+ const int64 in_size,
+ const float scale,
+ CachedInterpolation* interpolation) {
+ interpolation[out_size].lower = 0;
+ interpolation[out_size].upper = 0;
+ for (int64 i = out_size - 1; i >= 0; --i) {
+ const float in = i * scale;
+ interpolation[i].lower = static_cast<int64>(in);
+ interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1);
+ interpolation[i].lerp = in - interpolation[i].lower;
+ }
+}
+
+/**
+ * Computes the bilinear interpolation from the appropriate 4 float points
+ * and the linear interpolation weights.
+ */
+inline float compute_lerp(const float top_left, const float top_right,
+ const float bottom_left, const float bottom_right,
+ const float x_lerp, const float y_lerp) {
+ const float top = top_left + (top_right - top_left) * x_lerp;
+ const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
+ return top + (bottom - top) * y_lerp;
+}
+
+template <typename T>
+void resize_image(
+ typename TTypes<T, 4>::ConstTensor images, const int batch_size,
+ const int64 in_height, const int64 in_width, const int64 out_height,
+ const int64 out_width, const int channels,
+ const std::vector<CachedInterpolation>& xs,
+ const std::vector<CachedInterpolation>& ys,
+ typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE;
+template <typename T>
+void resize_image(typename TTypes<T, 4>::ConstTensor images,
+ const int batch_size, const int64 in_height,
+ const int64 in_width, const int64 out_height,
+ const int64 out_width, const int channels,
+ const std::vector<CachedInterpolation>& xs_vec,
+ const std::vector<CachedInterpolation>& ys,
+ typename TTypes<float, 4>::Tensor output) {
+ const int64 in_row_size = in_width * channels;
+ const int64 in_batch_num_values = in_height * in_row_size;
+ const int64 out_row_size = out_width * channels;
+
+ const T* input_b_ptr = images.data();
+ const CachedInterpolation* xs = xs_vec.data();
+
+ if (channels == 3) {
+ float* output_y_ptr = output.data();
+ for (int b = 0; b < batch_size; ++b) {
+ for (int64 y = 0; y < out_height; ++y) {
+ const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size;
+ const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size;
+ const float ys_lerp = ys[y].lerp;
+ for (int64 x = 0; x < out_width; ++x) {
+ const int64 xs_lower = xs[x].lower;
+ const int64 xs_upper = xs[x].upper;
+ const float xs_lerp = xs[x].lerp;
+
+ // Read channel 0.
+ const float top_left0(ys_input_lower_ptr[xs_lower + 0]);
+ const float top_right0(ys_input_lower_ptr[xs_upper + 0]);
+ const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]);
+ const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]);
+
+ // Read channel 1.
+ const float top_left1(ys_input_lower_ptr[xs_lower + 1]);
+ const float top_right1(ys_input_lower_ptr[xs_upper + 1]);
+ const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]);
+ const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]);
+
+ // Read channel 2.
+ const float top_left2(ys_input_lower_ptr[xs_lower + 2]);
+ const float top_right2(ys_input_lower_ptr[xs_upper + 2]);
+ const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]);
+ const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]);
+
+ // Compute output.
+ output_y_ptr[x * channels + 0] =
+ compute_lerp(top_left0, top_right0, bottom_left0, bottom_right0,
+ xs_lerp, ys_lerp);
+ output_y_ptr[x * channels + 1] =
+ compute_lerp(top_left1, top_right1, bottom_left1, bottom_right1,
+ xs_lerp, ys_lerp);
+ output_y_ptr[x * channels + 2] =
+ compute_lerp(top_left2, top_right2, bottom_left2, bottom_right2,
+ xs_lerp, ys_lerp);
+ }
+ output_y_ptr += out_row_size;
+ }
+ input_b_ptr += in_batch_num_values;
+ }
+ } else {
+ float* output_y_ptr = output.data();
+ for (int b = 0; b < batch_size; ++b) {
+ for (int64 y = 0; y < out_height; ++y) {
+ const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size;
+ const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size;
+ const float ys_lerp = ys[y].lerp;
+ for (int64 x = 0; x < out_width; ++x) {
+ auto xs_lower = xs[x].lower;
+ auto xs_upper = xs[x].upper;
+ auto xs_lerp = xs[x].lerp;
+ for (int c = 0; c < channels; ++c) {
+ const float top_left(ys_input_lower_ptr[xs_lower + c]);
+ const float top_right(ys_input_lower_ptr[xs_upper + c]);
+ const float bottom_left(ys_input_upper_ptr[xs_lower + c]);
+ const float bottom_right(ys_input_upper_ptr[xs_upper + c]);
+ output_y_ptr[x * channels + c] =
+ compute_lerp(top_left, top_right, bottom_left, bottom_right,
+ xs_lerp, ys_lerp);
+ }
+ }
+ output_y_ptr += out_row_size;
+ }
+ input_b_ptr += in_batch_num_values;
+ }
+ }
+}
+
+} // namespace
+
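For reference, `compute_lerp` above is the standard two-step bilinear blend: lerp along x on the top and bottom rows, then lerp the two intermediate results along y. A tiny self-contained check with hand-picked values (not taken from the kernel):

```cpp
#include <iostream>

// Same arithmetic as compute_lerp above.
float compute_lerp(float tl, float tr, float bl, float br,
                   float x_lerp, float y_lerp) {
  const float top = tl + (tr - tl) * x_lerp;
  const float bottom = bl + (br - bl) * x_lerp;
  return top + (bottom - top) * y_lerp;
}

int main() {
  // Sample the exact center of a 2x2 patch with corner values 0,1,2,3:
  // top = 0.5, bottom = 2.5, result = 1.5 (the mean of the four corners).
  std::cout << compute_lerp(0.f, 1.f, 2.f, 3.f, 0.5f, 0.5f) << "\n";
}
```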
// Partial specialization of ResizeBilinear functor for a CPUDevice.
namespace functor {
template <typename T>
@@ -83,11 +212,6 @@ struct ResizeBilinear<CPUDevice, T> {
const int64 out_height = output.dimension(1);
const int64 out_width = output.dimension(2);
- const int64 in_row_size = in_width * channels;
- const int64 in_batch_num_values = in_height * in_row_size;
- const int64 out_row_size = out_width * channels;
- const int64 out_batch_num_values = out_row_size * out_height;
-
// Handle no-op resizes efficiently.
if (out_height == in_height && out_width == in_width) {
output = images.template cast<float>();
@@ -108,13 +232,8 @@ struct ResizeBilinear<CPUDevice, T> {
xs[i].upper *= channels;
}
- for (int b = 0; b < batch_size; ++b) {
- crop_resize_single_image(
- images.data() + static_cast<int64>(b) * in_batch_num_values,
- in_height, in_width, out_height, out_width, channels, 0,
- out_width - 1, xs.data(), 0, out_height - 1, ys.data(), 0.0f, false,
- false, output.data() + static_cast<int64>(b) * out_batch_num_values);
- }
+ resize_image<T>(images, batch_size, in_height, in_width, out_height,
+ out_width, channels, xs, ys, output);
}
};
} // namespace functor
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index c5292e1ae1..cab9eb729d 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -213,64 +213,32 @@ class AssignVariableOp : public OpKernel {
"Variable and value dtypes don't match; respectively, ",
dtype_, " and ", context->input(1).dtype()));
Var* variable = nullptr;
- OP_REQUIRES_OK(
- context,
- LookupOrCreateResource<Var>(
- context, HandleFromInput(context, 0), &variable,
- [this, context](Var** ptr) {
- *ptr = new Var(dtype_);
- PersistentTensor unused;
- Tensor* tmp;
- AllocatorAttributes attr;
- if (!relax_constraints_) {
- attr.set_gpu_compatible(true);
- attr.set_nic_compatible(true);
- }
- TF_RETURN_IF_ERROR(context->allocate_persistent(
- dtype_, context->input(1).shape(), &unused, &tmp, attr));
- *(*ptr)->tensor() = *tmp;
- return Status::OK();
- }));
+ const Tensor& value = context->input(1);
+ // Note: every resource-variable-manipulating op assumes copy-on-write
+ // semantics, and copies the variable's Tensor before modifying it whenever
+ // its refcount is greater than 1. This means AssignVariableOp never needs
+ // to copy the original tensor: even if there are other live users of it,
+ // none of them can modify it, so adopting it directly is always safe. This
+ // holds even in esoteric cases where the same tensor initializes multiple
+ // variables, or where the tensor is a constant, since any future write
+ // triggers a copy first.
+ OP_REQUIRES_OK(context, LookupOrCreateResource<Var>(
+ context, HandleFromInput(context, 0), &variable,
+ [this, &value](Var** ptr) {
+ *ptr = new Var(dtype_);
+ *(*ptr)->tensor() = value;
+ (*ptr)->is_initialized = true;
+ return Status::OK();
+ }));
core::ScopedUnref s(variable);
-
OP_REQUIRES(context, variable->tensor()->dtype() == dtype_,
errors::InvalidArgument(
"Trying to assign variable with wrong dtype. Expected ",
DataTypeString(variable->tensor()->dtype()), " got ",
DataTypeString(dtype_)));
-
- const Tensor& value = context->input(1);
- AllocatorAttributes attr;
- if (!relax_constraints_) {
- attr.set_gpu_compatible(true);
- attr.set_nic_compatible(true);
- }
-
- // Copying is unnecessary if we are the last user of the value
- // tensor, we can just adopt the input tensor's buffer instead.
- std::unique_ptr<Tensor> input_alias = context->forward_input(
- 1, OpKernelContext::Params::kNoReservation /*output_index*/, dtype_,
- value.shape(), DEVICE_MEMORY, attr);
mutex_lock ml(*variable->mu());
variable->is_initialized = true;
- if (input_alias) {
- *variable->tensor() = *input_alias;
- return;
- }
-
- // Need to copy, but maybe we can re-use variable's buffer?
- if (!variable->tensor()->RefCountIsOne() ||
- !variable->tensor()->shape().IsSameSize(value.shape())) {
- // Copy to new buffer
- PersistentTensor unused;
- Tensor* tmp;
- OP_REQUIRES_OK(context, context->allocate_persistent(
- dtype_, value.shape(), &unused, &tmp, attr));
- *variable->tensor() = *tmp;
- }
- functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
- copy_functor(context->eigen_device<Device>(), variable->tensor()->flat<T>(),
- value.flat<T>());
+ *variable->tensor() = value;
}
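
The comment above leans on copy-on-write semantics: the assigned buffer is adopted directly, and any later writer copies first if the refcount exceeds one. A minimal sketch of that discipline, with `std::shared_ptr` standing in for the refcounted tensor buffer (illustrative, not the actual TF `Tensor` implementation):

```cpp
#include <iostream>
#include <memory>
#include <vector>

struct CowBuffer {
  std::shared_ptr<std::vector<float>> data;

  // Mutating access: copy the buffer first if anyone else still holds it.
  std::vector<float>& mutable_data() {
    if (data.use_count() > 1) {
      data = std::make_shared<std::vector<float>>(*data);  // private copy
    }
    return *data;
  }
};

int main() {
  CowBuffer a{std::make_shared<std::vector<float>>(3, 1.0f)};
  CowBuffer b = a;              // "assign": share the buffer, no copy yet
  b.mutable_data()[0] = 7.0f;   // first write triggers the copy
  std::cout << (*a.data)[0] << " " << (*b.data)[0] << "\n";  // 1 7
}
```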
private:
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 990bd2bff9..7930ce4615 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -23,6 +23,7 @@ limitations under the License.
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
@@ -226,43 +227,53 @@ void RestoreTensor(OpKernelContext* context,
#undef READER_COPY
}
-Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
- const Tensor& tensor_names,
- const Tensor& shape_and_slices,
- gtl::ArraySlice<DataType> dtypes) {
- const string& prefix_string = prefix.scalar<string>()();
+namespace {
- const auto& tensor_names_flat = tensor_names.flat<string>();
- const auto& shape_and_slices_flat = shape_and_slices.flat<string>();
+// Tensors larger than this threshold are restored from a thread pool.
+const int64 kLargeShapeThreshold = 16 << 20; // 16M
- // Sort lookup keys to improve locality when reading multiple tensors.
- std::vector<size_t> sorted_name_idx(tensor_names_flat.size());
- std::iota(sorted_name_idx.begin(), sorted_name_idx.end(), 0);
- std::sort(sorted_name_idx.begin(), sorted_name_idx.end(),
- [&tensor_names_flat](size_t a, size_t b) {
- return tensor_names_flat(a) < tensor_names_flat(b);
- });
+// A restore operation for a single tensor. Small tensors may be restored
+// directly from the op thread to improve read locality. Large tensors can be
+// restored from a thread pool: this requires creating a separate BundleReader
+// for each restore.
+struct RestoreOp {
+ RestoreOp& operator=(const RestoreOp&) = delete;
- BundleReader reader(Env::Default(), prefix_string);
- TF_RETURN_IF_ERROR(reader.status());
+ bool should_run_in_pool(BundleReader* reader) const {
+ TensorShape restored_full_shape;
- // TODO(zongheng): potential optimization: one Seek() in first lookup.
- // TODO(zongheng): consider measuring speed and issuing concurrent lookups
- // within a fixed memory budget.
- TensorShape restored_full_shape;
- Tensor* restored_tensor = nullptr;
- for (auto i : sorted_name_idx) {
- const string& tensor_name = tensor_names_flat(i);
- const string& shape_and_slice = shape_and_slices_flat(i);
+ // Ignore status here; we'll catch the error later.
+ if (!reader->LookupTensorShape(tensor_name, &restored_full_shape).ok()) {
+ return false;
+ }
+ return restored_full_shape.num_elements() > kLargeShapeThreshold;
+ }
+
+ // Run this restore operation using a new BundleReader.
+ void run_with_new_reader() {
+ BundleReader reader(Env::Default(), reader_prefix);
+ if (!reader.status().ok()) {
+ status = reader.status();
+ return;
+ }
+
+ status = run(&reader);
+ }
+
+ Status run(BundleReader* reader) {
+ TensorShape restored_full_shape;
TF_RETURN_IF_ERROR(
- reader.LookupTensorShape(tensor_name, &restored_full_shape));
+ reader->LookupTensorShape(tensor_name, &restored_full_shape));
+ VLOG(1) << "Restoring tensor " << idx << " : " << tensor_name << " : "
+ << restored_full_shape.num_elements();
+ Tensor* restored_tensor;
if (shape_and_slice.empty()) {
// Lookup the full tensor.
TF_RETURN_IF_ERROR(
- context->allocate_output(i, restored_full_shape, &restored_tensor));
- TF_RETURN_IF_ERROR(reader.Lookup(tensor_name, restored_tensor));
+ context->allocate_output(idx, restored_full_shape, &restored_tensor));
+ TF_RETURN_IF_ERROR(reader->Lookup(tensor_name, restored_tensor));
} else {
// Lookup the slice.
TensorShape parsed_full_shape;
@@ -272,6 +283,7 @@ Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
TF_RETURN_IF_ERROR(
checkpoint::ParseShapeAndSlice(shape_and_slice, &parsed_full_shape,
&parsed_slice, &parsed_slice_shape));
+
if (!restored_full_shape.IsSameSize(parsed_full_shape)) {
return errors::InvalidArgument(
"tensor_name = ", tensor_name, "; shape in shape_and_slice spec ",
@@ -279,19 +291,93 @@ Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
" does not match the shape stored in checkpoint: ",
restored_full_shape.DebugString());
}
-
TF_RETURN_IF_ERROR(
- context->allocate_output(i, parsed_slice_shape, &restored_tensor));
+ context->allocate_output(idx, parsed_slice_shape, &restored_tensor));
TF_RETURN_IF_ERROR(
- reader.LookupSlice(tensor_name, parsed_slice, restored_tensor));
+ reader->LookupSlice(tensor_name, parsed_slice, restored_tensor));
+ }
+ return Status::OK();
+ }
+
+ OpKernelContext* context;
+ size_t idx;
+ string tensor_name;
+ string shape_and_slice;
+ string reader_prefix;
+
+ ::tensorflow::Status status;
+};
+
+} // namespace
+
+Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
+ const Tensor& tensor_names,
+ const Tensor& shape_and_slices,
+ gtl::ArraySlice<DataType> dtypes) {
+ const string& prefix_string = prefix.scalar<string>()();
+
+ const auto& tensor_names_flat = tensor_names.flat<string>();
+ const auto& shape_and_slices_flat = shape_and_slices.flat<string>();
+
+ // Sort lookup keys to improve locality when reading multiple tensors.
+ std::vector<size_t> sorted_name_idx(tensor_names_flat.size());
+ std::iota(sorted_name_idx.begin(), sorted_name_idx.end(), 0);
+ std::sort(sorted_name_idx.begin(), sorted_name_idx.end(),
+ [&tensor_names_flat](size_t a, size_t b) {
+ return tensor_names_flat(a) < tensor_names_flat(b);
+ });
+
+ std::vector<std::unique_ptr<RestoreOp> > pool_restore_ops;
+ std::vector<std::unique_ptr<RestoreOp> > direct_restore_ops;
+
+ BundleReader default_reader(Env::Default(), prefix_string);
+ TF_RETURN_IF_ERROR(default_reader.status());
+
+ for (auto i : sorted_name_idx) {
+ const string& tensor_name = tensor_names_flat(i);
+ const string& shape_and_slice = shape_and_slices_flat(i);
+ auto op =
+ new RestoreOp{context, i, tensor_name, shape_and_slice, prefix_string};
+ if (op->should_run_in_pool(&default_reader)) {
+ pool_restore_ops.emplace_back(op);
+ } else {
+ direct_restore_ops.emplace_back(op);
}
- if (dtypes[i] != restored_tensor->dtype()) {
+ }
+
+ {
+ // Schedule any threaded operations first, skipping thread pool creation if
+ // we don't have any expensive operations.
+ std::unique_ptr<thread::ThreadPool> reader_pool;
+ if (!pool_restore_ops.empty()) {
+ reader_pool.reset(
+ new thread::ThreadPool(Env::Default(), "restore_tensors", 8));
+ for (auto& op : pool_restore_ops) {
+ reader_pool->Schedule([&op]() { op->run_with_new_reader(); });
+ }
+ }
+
+ // Read small tensors from the op thread
+ for (auto& op : direct_restore_ops) {
+ TF_RETURN_IF_ERROR(op->run(&default_reader));
+ }
+ }
+
+ // Check status of pool ops; this must come after the pool shuts down.
+ for (auto& op : pool_restore_ops) {
+ TF_RETURN_IF_ERROR(op->status);
+ }
+
+ for (auto i : sorted_name_idx) {
+ const string& tensor_name = tensor_names_flat(i);
+ if (dtypes[i] != context->mutable_output(i)->dtype()) {
return errors::InvalidArgument(
"tensor_name = ", tensor_name, "; expected dtype ",
DataTypeString(dtypes[i]), " does not equal restored dtype ",
- DataTypeString(restored_tensor->dtype()));
+ DataTypeString(context->mutable_output(i)->dtype()));
}
}
+
return Status::OK();
}
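
The restructured `RestoreTensorsV2` splits work by size: tensors over the 16M-element threshold go to an eight-thread pool with a fresh `BundleReader` per op, while small tensors run inline on the op thread to keep reads local. A minimal sketch of the size-based split, with plain `std::thread`s and a stub `Restore` standing in for the real pool and bundle reads:

```cpp
#include <cstdint>
#include <iostream>
#include <thread>
#include <vector>

constexpr int64_t kLargeThreshold = 16 << 20;  // same 16M cutoff as the patch

void Restore(int64_t num_elements) {
  (void)num_elements;  // stand-in for reading one tensor from the checkpoint
}

int main() {
  const std::vector<int64_t> sizes = {100, 32LL << 20, 5000, 64LL << 20};
  std::vector<std::thread> workers;
  for (int64_t s : sizes) {
    if (s > kLargeThreshold) {
      workers.emplace_back(Restore, s);  // large: run on a worker thread
    } else {
      Restore(s);                        // small: run inline for locality
    }
  }
  for (auto& t : workers) t.join();      // statuses are checked after joining
  std::cout << "restored " << sizes.size() << " tensors\n";
}
```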
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index c44753e25e..e0194605ce 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -541,11 +541,13 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices,
}
}
if (bad_i >= 0) {
+ auto slice_shape = indices.shape();
+ slice_shape.RemoveLastDims(1);
return errors::InvalidArgument(
- "Invalid indices: ", SliceDebugString(indices.shape(), bad_i), " = [",
+ "indices", SliceDebugString(slice_shape, bad_i), " = [",
str_util::Join(
gtl::ArraySlice<Index>(&indices_flat(bad_i, 0), slice_dim), ", "),
- "] does not index into ", shape.DebugString());
+ "] does not index into shape ", shape.DebugString());
}
return Status::OK();
}
diff --git a/tensorflow/core/kernels/scatter_nd_op_test.cc b/tensorflow/core/kernels/scatter_nd_op_test.cc
index c134a8dd5b..95ecc69c95 100644
--- a/tensorflow/core/kernels/scatter_nd_op_test.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_test.cc
@@ -185,7 +185,7 @@ TEST_F(ScatterNdUpdateOpTest, Error_IndexOutOfRange) {
{100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
Status s = RunOpKernel();
EXPECT_TRUE(str_util::StrContains(
- s.ToString(), "Invalid indices: [2,0] = [99] does not index into [5,3]"))
+ s.ToString(), "indices[2] = [99] does not index into shape [5,3]"))
<< s;
}
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 1e3e92a68a..59fdc2262a 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -32,6 +32,7 @@ limitations under the License.
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
#include "tensorflow/core/kernels/variable_ops.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
@@ -304,6 +305,9 @@ class StridedSliceAssignOp : public OpKernel {
Var* v;
OP_REQUIRES_OK(context,
LookupResource(context, HandleFromInput(context, 0), &v));
+ mutex_lock ml(*v->mu());
+ OP_REQUIRES_OK(context,
+ PrepareToUpdateVariable<Device, T>(context, v->tensor()));
old_lhs = *v->tensor();
OP_REQUIRES(context, old_lhs.dtype() == DataTypeToEnum<T>::value,
errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/training_op_helpers.cc b/tensorflow/core/kernels/training_op_helpers.cc
index f288e124ee..d3c4f62071 100644
--- a/tensorflow/core/kernels/training_op_helpers.cc
+++ b/tensorflow/core/kernels/training_op_helpers.cc
@@ -39,8 +39,15 @@ mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input) {
// GetInputTensor which will signal a failure.
std::vector<mutex_lock> MaybeLockVariableInputMutexesInOrder(
OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
+ bool any_resource = false;
+ for (auto i : input_ids) {
+ if (ctx->input_dtype(i) == DT_RESOURCE) {
+ any_resource = true;
+ break;
+ }
+ }
std::vector<mutex_lock> locks;
- if (!do_lock) {
+ if (!do_lock && !any_resource) {
return locks;
}
std::vector<mutex*> mutexes;
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index 7e56e15450..765335d3a0 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -80,18 +80,8 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
Var* var;
TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, input), &var));
core::ScopedUnref unref_var(var);
- if (lock_held) {
- TF_RETURN_IF_ERROR(
- PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
- *out = *var->tensor();
- } else {
- mutex_lock ml(*var->mu());
- if (!sparse) {
- TF_RETURN_IF_ERROR(
- PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
- }
- *out = *var->tensor();
- }
+ TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
+ *out = *var->tensor();
return Status::OK();
}
*out = ctx->mutable_input(input, lock_held);
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 1c130ba300..d6f3f26cd5 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -45,17 +45,25 @@ typedef std::complex<double> complex128;
struct bfloat16 {
B16_DEVICE_FUNC bfloat16() {}
- B16_DEVICE_FUNC explicit bfloat16(const float v) {
+ B16_DEVICE_FUNC static bfloat16 truncate_to_bfloat16(const float v) {
+ bfloat16 output;
if (float_isnan(v)) {
- value = NAN_VALUE;
- return;
+ output.value = NAN_VALUE;
+ return output;
}
const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- value = p[0];
+ output.value = p[0];
#else
- value = p[1];
+ output.value = p[1];
#endif
+ return output;
+ }
+
+ B16_DEVICE_FUNC explicit bfloat16(const float v) {
+ // TODO(asabne): change the line below to
+ // value = round_to_bfloat16(v).value;
+ value = truncate_to_bfloat16(v).value;
}
B16_DEVICE_FUNC explicit bfloat16(const double val)
@@ -169,8 +177,6 @@ struct bfloat16 {
// Converts a float point to bfloat16, with round-nearest-to-even as rounding
// method.
- // TODO(b/69266521): Add a truncate_to_bfloat16 function and make this
- // function as default behavior.
// TODO: There is a slightly faster implementation (8% faster on CPU)
// than this (documented in cl/175987786), which is exponentially harder to
// understand and document. Switch to the faster version when converting to
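
Truncation, as added above, simply keeps the upper 16 bits of the IEEE-754 float (sign, exponent, and the top 7 mantissa bits), unlike the round-nearest-even path. A standalone sketch of the bit manipulation, little-endian and without the NaN special case:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

uint16_t truncate_to_bfloat16_bits(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));   // type-pun safely via memcpy
  return static_cast<uint16_t>(bits >> 16);  // drop the low 16 mantissa bits
}

int main() {
  // 1.0f is 0x3F800000, so its bfloat16 truncation is 0x3f80.
  std::printf("%#06x\n", truncate_to_bfloat16_bits(1.0f));
}
```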
diff --git a/tensorflow/core/lib/core/errors.h b/tensorflow/core/lib/core/errors.h
index 51c09032df..a631d9815a 100644
--- a/tensorflow/core/lib/core/errors.h
+++ b/tensorflow/core/lib/core/errors.h
@@ -19,6 +19,7 @@ limitations under the License.
#include <sstream>
#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
@@ -118,6 +119,25 @@ DECLARE_ERROR(Unauthenticated, UNAUTHENTICATED)
#undef DECLARE_ERROR
+// Produces a formatted string pattern from the node name that upstream code
+// can use to uniquely identify the node and emit an informative error
+// message. The pattern is: {{node <name>}}
+// Note: The pattern below determines the regex _NODEDEF_NAME_RE in the file
+// tensorflow/python/client/session.py
+// LINT.IfChange
+inline string FormatNodeNameForError(const string& name) {
+ return strings::StrCat("{{node ", name, "}}");
+}
+// LINT.ThenChange(//tensorflow/python/client/session.py)
+template <typename T>
+string FormatNodeNamesForError(const T& names) {
+ ::tensorflow::str_util::Formatter<string> f(
+ [](string* output, const string& s) {
+ ::tensorflow::strings::StrAppend(output, FormatNodeNameForError(s));
+ });
+ return ::tensorflow::str_util::Join(names, ", ", f);
+}
+
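A standalone sketch of the two helpers above, with `std::string` and a manual join standing in for TF's `string`, `StrCat`, and `str_util::Join` (illustrative only):

```cpp
#include <iostream>
#include <string>
#include <vector>

std::string FormatNodeNameForError(const std::string& name) {
  return "{{node " + name + "}}";
}

std::string FormatNodeNamesForError(const std::vector<std::string>& names) {
  std::string out;
  for (size_t i = 0; i < names.size(); ++i) {
    if (i) out += ", ";
    out += FormatNodeNameForError(names[i]);
  }
  return out;
}

int main() {
  std::cout << FormatNodeNamesForError({"A", "B"}) << "\n";
  // prints: {{node A}}, {{node B}}
}
```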
// The CanonicalCode() for non-errors.
using ::tensorflow::error::OK;
diff --git a/tensorflow/core/lib/db/sqlite.cc b/tensorflow/core/lib/db/sqlite.cc
index cb6943379d..cf11f3a331 100644
--- a/tensorflow/core/lib/db/sqlite.cc
+++ b/tensorflow/core/lib/db/sqlite.cc
@@ -112,6 +112,7 @@ Status EnvPragma(Sqlite* db, const char* pragma, const char* var) {
/* static */
Status Sqlite::Open(const string& path, int flags, Sqlite** db) {
flags |= SQLITE_OPEN_PRIVATECACHE;
+ flags |= SQLITE_OPEN_URI;
sqlite3* sqlite = nullptr;
int rc = sqlite3_open_v2(path.c_str(), &sqlite, flags, nullptr);
if (rc != SQLITE_OK) {
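With `SQLITE_OPEN_URI` set, `path` may be a URI carrying query parameters such as `mode=ro`. A hypothetical direct use of the underlying SQLite call (the filename is a placeholder):

```cpp
#include <sqlite3.h>
#include <cstdio>

int main() {
  sqlite3* db = nullptr;
  // URI filenames only work when SQLITE_OPEN_URI is part of the flags.
  int rc = sqlite3_open_v2("file:example.db?mode=ro", &db,
                           SQLITE_OPEN_READONLY | SQLITE_OPEN_URI, nullptr);
  if (rc != SQLITE_OK) std::printf("open failed: %s\n", sqlite3_errmsg(db));
  sqlite3_close(db);
}
```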
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 69351cd392..4ac8e15160 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -12074,6 +12074,32 @@ op {
}
}
op {
+ name: "Cast"
+ input_arg {
+ name: "x"
+ type_attr: "SrcT"
+ }
+ output_arg {
+ name: "y"
+ type_attr: "DstT"
+ }
+ attr {
+ name: "SrcT"
+ type: "type"
+ }
+ attr {
+ name: "DstT"
+ type: "type"
+ }
+ attr {
+ name: "Truncate"
+ type: "bool"
+ default_value {
+ b: false
+ }
+ }
+}
+op {
name: "Ceil"
input_arg {
name: "x"
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
index 444aa8b954..2059741da9 100644
--- a/tensorflow/core/ops/lookup_ops.cc
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -140,11 +140,13 @@ REGISTER_OP("LookupTableSize")
.Input("table_handle: Ref(string)")
.Output("size: int64")
.SetShapeFn(TwoElementVectorInputsAndScalarOutputs);
+WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableSize");
REGISTER_OP("LookupTableSizeV2")
.Input("table_handle: resource")
.Output("size: int64")
.SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs);
+WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableSizeV2");
REGISTER_OP("LookupTableExport")
.Input("table_handle: Ref(string)")
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 4b0591c6e8..1667c398f4 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -114,6 +114,7 @@ REGISTER_OP("Cast")
.Output("y: DstT")
.Attr("SrcT: type")
.Attr("DstT: type")
+ .Attr("Truncate: bool = false")
.SetShapeFn(shape_inference::UnchangedShape);
REGISTER_OP("_HostCast")
@@ -121,6 +122,7 @@ REGISTER_OP("_HostCast")
.Output("y: DstT")
.Attr("SrcT: type")
.Attr("DstT: type")
+ .Attr("Truncate: bool = false")
.SetShapeFn(shape_inference::UnchangedShape)
.Doc(R"doc(
Cast x of type SrcT to y of DstT.
@@ -1504,6 +1506,13 @@ REGISTER_OP("QuantizedAdd")
.SetIsCommutative()
.SetShapeFn([](InferenceContext* c) {
TF_RETURN_IF_ERROR(shape_inference::BroadcastBinaryOpShapeFn(c));
+ // min_x, max_x, min_y, max_y should be scalar.
+ ShapeHandle unused;
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+ TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+
c->set_output(1, c->Scalar());
c->set_output(2, c->Scalar());
return Status::OK();
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 25dc033065..23f1538912 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -543,4 +543,19 @@ TEST(MathOpsTest, HistogramFixedWidth_ShapeFn) {
INFER_OK(op, "[?];[2];[]", "[?]");
INFER_OK(op, "[?];[2];?", "[?]");
}
+
+TEST(MathOpsTest, QuantizedAdd_ShapeFn) {
+ ShapeInferenceTestOp op("QuantizedAdd");
+
+ INFER_OK(op, "?;?;?;?;?;?", "?;[];[]");
+ INFER_OK(op, "?;?;[];[];[];[]", "?;[];[]");
+ INFER_OK(op, "[1,2];?;[];[];[];[]", "?;[];[]");
+ INFER_OK(op, "[];[2];[];[];[];[]", "[d1_0];[];[]");
+
+ // Rank checks on input scalars.
+ INFER_ERROR("must be rank 0", op, "?;?;[1];?;?;?");
+ INFER_ERROR("must be rank 0", op, "?;?;?;[2];?;?");
+ INFER_ERROR("must be rank 0", op, "?;?;?;?;[3];?");
+ INFER_ERROR("must be rank 0", op, "?;?;?;?;?;[4]");
+}
} // end namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 978bb0bbf4..22a2f423c2 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4977,6 +4977,13 @@ op {
name: "DstT"
type: "type"
}
+ attr {
+ name: "Truncate"
+ type: "bool"
+ default_value {
+ b: false
+ }
+ }
}
op {
name: "Ceil"
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index 09029a4b25..3a012c23fd 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -58,3 +58,9 @@ def if_static(extra_deps, otherwise=[]):
str(Label("//tensorflow:framework_shared_object")): otherwise,
"//conditions:default": extra_deps,
})
+
+def if_dynamic_kernels(extra_deps, otherwise=[]):
+ return select({
+ str(Label("//tensorflow:dynamic_loaded_kernels")): extra_deps,
+ "//conditions:default": otherwise,
+ })
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index c461a40086..305a9a682f 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -86,7 +86,7 @@ TEST_F(DefaultEnvTest, IncompleteReadOutOfRange) {
TEST_F(DefaultEnvTest, ReadFileToString) {
for (const int length : {0, 1, 1212, 2553, 4928, 8196, 9000, (1 << 20) - 1,
- 1 << 20, (1 << 20) + 1}) {
+ 1 << 20, (1 << 20) + 1, (256 << 20) + 100}) {
const string filename = strings::StrCat(BaseDir(), "/bar/..//file", length);
// Write a file with the given length
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index f2aaf13bec..5375f56372 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -33,6 +33,7 @@ limitations under the License.
#include "tensorflow/core/platform/init_main.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/numa.h"
#include "tensorflow/core/platform/snappy.h"
#include "tensorflow/core/platform/types.h"
@@ -57,6 +58,17 @@ int NumSchedulableCPUs() {
return system_info.dwNumberOfProcessors;
}
+bool NUMAEnabled() {
+ // Not yet implemented: coming soon.
+ return false;
+}
+
+int NUMANumNodes() { return 1; }
+
+void NUMASetThreadNodeAffinity(int node) {}
+
+int NUMAGetThreadNodeAffinity() { return kNUMANoAffinity; }
+
void* AlignedMalloc(size_t size, int minimum_alignment) {
#ifdef TENSORFLOW_USE_JEMALLOC
void* ptr = NULL;
@@ -108,6 +120,14 @@ void Free(void* ptr) {
#endif
}
+void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
+ return AlignedMalloc(size, minimum_alignment);
+}
+
+void NUMAFree(void* ptr, size_t size) { Free(ptr); }
+
+int NUMAGetMemAffinity(const void* addr) { return kNUMANoAffinity; }
+
void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
// No-op.
}
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 22a2691dcc..d701ce8e12 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -416,6 +416,11 @@ message RunOptions {
int64 timeout_in_ms = 2;
// The thread pool to use, if session_inter_op_thread_pool is configured.
+ // To run Session::Run() in the caller thread, set this to -1; doing so
+ // avoids a context switch. Use the caller thread ONLY for simple graphs,
+ // where the overhead of the extra context switch is comparable to the
+ // overhead of Session::Run() itself.
int32 inter_op_thread_pool = 3;
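
A hedged sketch of requesting caller-thread execution through the generated proto API; it assumes a session configured with `session_inter_op_thread_pool`, and only the field shown is taken from the proto above:

```cpp
#include "tensorflow/core/protobuf/config.pb.h"

// Returns RunOptions requesting caller-thread execution; pass the result to
// Session::Run(). -1 is only meaningful when session_inter_op_thread_pool
// pools are configured, per the comment above.
tensorflow::RunOptions CallerThreadRunOptions() {
  tensorflow::RunOptions opts;
  opts.set_inter_op_thread_pool(-1);
  return opts;
}
```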
// Whether the partition graph(s) executed by the executor(s) should be
diff --git a/tensorflow/core/protobuf/eager_service.proto b/tensorflow/core/protobuf/eager_service.proto
index 5b05a1b3ee..63ba4eb173 100644
--- a/tensorflow/core/protobuf/eager_service.proto
+++ b/tensorflow/core/protobuf/eager_service.proto
@@ -8,6 +8,7 @@ import "tensorflow/core/framework/function.proto";
import "tensorflow/core/framework/versions.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";
import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/tensor.proto";
message RemoteTensorHandle {
// The ID of the operation that produced this tensor.
@@ -128,6 +129,24 @@ message RegisterFunctionRequest {
message RegisterFunctionResponse {
}
+message SendTensorRequest {
+ fixed64 context_id = 1;
+
+ // All remote tensors are identified by <Op ID, Output num>. To mimic this
+ // situation when directly sending tensors, we include an "artificial" op ID
+ // (which would have corresponded to the _Recv op when not using SendTensor).
+ int64 op_id = 2;
+ // The index within the repeated field is the output number that will help
+ // uniquely identify (along with the above op_id) the particular tensor.
+ repeated TensorProto tensors = 3;
+
+ // The device on which the tensors should be resident.
+ string device_name = 4;
+}
+
+message SendTensorResponse {
+}
+
////////////////////////////////////////////////////////////////////////////////
//
// Eager Service defines a TensorFlow service that executes operations eagerly
@@ -174,4 +193,8 @@ service EagerService {
// Takes a FunctionDef and makes it enqueueable on the remote worker.
rpc RegisterFunction(RegisterFunctionRequest)
returns (RegisterFunctionResponse);
+
+ // An RPC to push tensors to the server. Some environments do not allow the
+ // server to connect back to the client, so the client pushes tensors instead.
+ rpc SendTensor(SendTensorRequest) returns (SendTensorResponse);
}
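
A sketch of building a `SendTensorRequest` with the generated protobuf API; the op id and device name are placeholders, and the `tensorflow::eager` namespace is assumed from the proto's package:

```cpp
#include <cstdint>

#include "tensorflow/core/protobuf/eager_service.pb.h"

tensorflow::eager::SendTensorRequest MakeSendTensorRequest(uint64_t ctx_id) {
  tensorflow::eager::SendTensorRequest req;
  req.set_context_id(ctx_id);
  req.set_op_id(42);  // artificial op id identifying the pushed tensor
  req.set_device_name("/job:worker/replica:0/task:0/device:CPU:0");
  req.add_tensors();  // fill one TensorProto per output tensor
  return req;
}
```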
diff --git a/tensorflow/core/util/equal_graph_def_test.cc b/tensorflow/core/util/equal_graph_def_test.cc
index c54540332e..77ca8eaec3 100644
--- a/tensorflow/core/util/equal_graph_def_test.cc
+++ b/tensorflow/core/util/equal_graph_def_test.cc
@@ -85,7 +85,7 @@ TEST_F(EqualGraphDefTest, NoMatch) {
Input(e_.opts().WithName("A"));
Input(a_.opts().WithName("B"));
EXPECT_FALSE(Match());
- EXPECT_EQ("Did not find expected node 'A = Input[]()'", diff_);
+ EXPECT_EQ("Did not find expected node '{{node A}} = Input[]()'", diff_);
}
TEST_F(EqualGraphDefTest, MissingNode) {
@@ -93,7 +93,7 @@ TEST_F(EqualGraphDefTest, MissingNode) {
Input(e_.opts().WithName("B"));
Input(a_.opts().WithName("A"));
EXPECT_FALSE(Match());
- EXPECT_EQ("Did not find expected node 'B = Input[]()'", diff_);
+ EXPECT_EQ("Did not find expected node '{{node B}} = Input[]()'", diff_);
}
TEST_F(EqualGraphDefTest, ExtraNode) {
@@ -101,7 +101,7 @@ TEST_F(EqualGraphDefTest, ExtraNode) {
Input(a_.opts().WithName("A"));
Input(a_.opts().WithName("B"));
EXPECT_FALSE(Match());
- EXPECT_EQ("Found unexpected node 'B = Input[]()'", diff_);
+ EXPECT_EQ("Found unexpected node '{{node B}} = Input[]()'", diff_);
}
TEST_F(EqualGraphDefTest, NodeOrder) {
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index bb447e0393..566a42dbd5 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -17,7 +17,6 @@ limitations under the License.
#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
#ifdef INTEL_MKL
-#include <string>
#include <vector>
#include <unordered_map>
#include <utility>
@@ -1896,16 +1895,17 @@ class MklPrimitiveFactory {
MklPrimitiveFactory() {}
~MklPrimitiveFactory() {}
- MklPrimitive* GetOp(const std::string& key) {
+ MklPrimitive* GetOp(const string& key) {
auto stream_iter = MklPrimitiveFactory<T>::GetHashMap().find(key);
if (stream_iter == MklPrimitiveFactory<T>::GetHashMap().end()) {
return nullptr;
} else {
+ CHECK(stream_iter->second != nullptr) << "nullptr present in map";
return stream_iter->second;
}
}
- void SetOp(const std::string& key, MklPrimitive* op) {
+ void SetOp(const string& key, MklPrimitive* op) {
auto stream_iter = MklPrimitiveFactory<T>::GetHashMap().find(key);
CHECK(stream_iter == MklPrimitiveFactory<T>::GetHashMap().end());
@@ -1914,8 +1914,8 @@ class MklPrimitiveFactory {
}
private:
- static inline std::unordered_map<std::string, MklPrimitive*>& GetHashMap() {
- static thread_local std::unordered_map<std::string, MklPrimitive*> map_;
+ static inline std::unordered_map<string, MklPrimitive*>& GetHashMap() {
+ static thread_local std::unordered_map<string, MklPrimitive*> map_;
return map_;
}
};
@@ -1943,9 +1943,7 @@ class FactoryKeyCreator {
Append(StringPiece(buffer, sizeof(T)));
}
- std::string GetKey() {
- return key_;
- }
+ string GetKey() { return key_; }
private:
string key_;
@@ -2020,8 +2018,8 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory<T> {
MklReorderPrimitiveFactory() {};
~MklReorderPrimitiveFactory() {};
- static std::string CreateKey(const memory* from, const memory* to) {
- std::string prefix = "reorder";
+ static string CreateKey(const memory* from, const memory* to) {
+ string prefix = "reorder";
FactoryKeyCreator key_creator;
auto const &from_desc = from->get_primitive_desc().desc().data;
auto const &to_desc = to->get_primitive_desc().desc().data;
@@ -2038,12 +2036,12 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory<T> {
}
MklPrimitive* GetReorder(const memory* from, const memory* to) {
- std::string key = CreateKey(from, to);
+ string key = CreateKey(from, to);
return this->GetOp(key);
}
void SetReorder(const memory* from, const memory* to, MklPrimitive* op) {
- std::string key = CreateKey(from, to);
+ string key = CreateKey(from, to);
this->SetOp(key, op);
}
};
diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc
index a5c1fda102..2117042034 100644
--- a/tensorflow/core/util/stat_summarizer.cc
+++ b/tensorflow/core/util/stat_summarizer.cc
@@ -133,7 +133,6 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
int64 first_node_start_us =
step_stats.dev_stats(0).node_stats(0).all_start_micros();
- std::map<std::string, Detail> details;
int node_num = 0;
for (const auto& ds : step_stats.dev_stats()) {
@@ -177,22 +176,15 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
++node_num;
const int64 curr_time = ns.all_end_rel_micros();
curr_total_us += curr_time;
- auto result = details.emplace(name, Detail());
auto output_result =
outputs_.emplace(name, std::vector<TensorDescription>());
std::vector<TensorDescription>* outputs = &(output_result.first->second);
- Detail* detail = &(result.first->second);
- detail->start_us.UpdateStat(ns.all_start_micros() - first_node_start_us);
- detail->rel_end_us.UpdateStat(curr_time);
+ int64_t start_us = (ns.all_start_micros() - first_node_start_us);
+ int64_t rel_end_us = curr_time;
// If this is the first pass, initialize some values.
- if (result.second) {
- detail->name = name;
- detail->type = op_type;
-
- detail->run_order = node_num;
-
+ if (output_result.second) {
outputs->resize(ns.output_size());
for (const auto& output : ns.output()) {
const int32 slot = output.slot();
@@ -202,7 +194,6 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
}
(*outputs)[slot] = output.tensor_description();
}
- detail->times_called = 0;
}
int64 curr_node_mem = 0;
@@ -210,11 +201,10 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
const int64 mem_usage = mem.total_bytes();
curr_node_mem += mem_usage;
}
- detail->mem_used.UpdateStat(curr_node_mem);
- mem_total += curr_node_mem;
+ stats_calculator_->AddNodeStats(name, op_type, node_num, start_us,
+ rel_end_us, curr_node_mem);
- ++detail->times_called;
- stats_calculator_->UpdateDetails(details);
+ mem_total += curr_node_mem;
Validate(outputs, ns);
}
diff --git a/tensorflow/core/util/stats_calculator.cc b/tensorflow/core/util/stats_calculator.cc
index c4befbdb84..eb07754650 100644
--- a/tensorflow/core/util/stats_calculator.cc
+++ b/tensorflow/core/util/stats_calculator.cc
@@ -272,9 +272,24 @@ std::string StatsCalculator::GetOutputString() const {
return stream.str();
}
-void StatsCalculator::UpdateDetails(
- const std::map<std::string, Detail>& details) {
- details_.insert(details.begin(), details.end());
+void StatsCalculator::AddNodeStats(const std::string& name,
+ const std::string& type, int64_t run_order,
+ int64_t start_us, int64_t rel_end_us,
+ int64_t mem_used) {
+ Detail* detail = nullptr;
+ if (details_.find(name) == details_.end()) {
+ details_.insert({name, {}});
+ detail = &details_.at(name);
+ detail->type = type;
+ detail->name = name;
+ detail->run_order = run_order;
+ } else {
+ detail = &details_.at(name);
+ }
+ detail->start_us.UpdateStat(start_us);
+ detail->rel_end_us.UpdateStat(rel_end_us);
+ detail->mem_used.UpdateStat(mem_used);
+ detail->times_called++;
}
} // namespace tensorflow
diff --git a/tensorflow/core/util/stats_calculator.h b/tensorflow/core/util/stats_calculator.h
index 39cef816f1..e191737bb2 100644
--- a/tensorflow/core/util/stats_calculator.h
+++ b/tensorflow/core/util/stats_calculator.h
@@ -163,7 +163,10 @@ class StatsCalculator {
};
const std::map<std::string, Detail>& GetDetails() const { return details_; }
- void UpdateDetails(const std::map<std::string, Detail>& details);
+
+ void AddNodeStats(const std::string& name, const std::string& type,
+ int64_t run_order, int64_t start_us, int64_t rel_end_us,
+ int64_t mem_used);
private:
void OrderNodesByMetric(SortingMetric sorting_metric,
diff --git a/tensorflow/core/util/stats_calculator_test.cc b/tensorflow/core/util/stats_calculator_test.cc
new file mode 100644
index 0000000000..00d7bfc2f9
--- /dev/null
+++ b/tensorflow/core/util/stats_calculator_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/stats_calculator.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+using Detail = StatsCalculator::Detail;
+
+TEST(StatsCalculatorTest, TotalTimeMs) {
+ auto options = StatSummarizerOptions();
+ StatsCalculator calc(options);
+
+ EXPECT_EQ(0, calc.num_runs());
+ calc.UpdateRunTotalUs(1);
+
+ EXPECT_EQ(1, calc.num_runs());
+ calc.UpdateRunTotalUs(2);
+
+ EXPECT_EQ(2, calc.num_runs());
+ auto run_time_us = calc.run_total_us();
+ EXPECT_EQ(1, run_time_us.min());
+ EXPECT_FLOAT_EQ(1.5, run_time_us.avg());
+}
+
+TEST(StatsCalculatorTest, AddNodeStatsUpdate) {
+ auto options = StatSummarizerOptions();
+ StatsCalculator calc(options);
+ EXPECT_TRUE(calc.GetDetails().empty());
+
+ const int64_t node1_run_order = 1;
+ const int64_t run1_start_us = 1;
+ const int64_t run1_end_us = 2;
+ const int64_t run1_mem_used = 45;
+ calc.AddNodeStats("node1", "type_1", node1_run_order, run1_start_us,
+ run1_end_us, run1_mem_used);
+ ASSERT_EQ(1, calc.GetDetails().size());
+ const Detail& detail = calc.GetDetails().at("node1");
+ EXPECT_EQ(1, detail.times_called);
+ EXPECT_EQ("node1", detail.name);
+ EXPECT_EQ("type_1", detail.type);
+ EXPECT_EQ(node1_run_order, detail.run_order);
+
+ const int64_t run2_start_us = 3;
+ const int64_t run2_end_us = 5;
+ const int64_t run2_mem_used = 145;
+ calc.AddNodeStats("node1", "type_1", node1_run_order, run2_start_us,
+ run2_end_us, run2_mem_used);
+ EXPECT_EQ(1, calc.GetDetails().size());
+
+ EXPECT_EQ(2, detail.times_called);
+ EXPECT_EQ("node1", detail.name);
+ EXPECT_EQ("type_1", detail.type);
+ EXPECT_EQ(node1_run_order, detail.run_order);
+
+ EXPECT_EQ(run1_start_us + run2_start_us, detail.start_us.sum());
+ EXPECT_EQ(run1_end_us + run2_end_us, detail.rel_end_us.sum());
+ EXPECT_EQ(run1_mem_used + run2_mem_used, detail.mem_used.sum());
+}
+
+} // namespace
+} // namespace tensorflow
diff --git a/tensorflow/docs_src/deploy/distributed.md b/tensorflow/docs_src/deploy/distributed.md
index 8e2c818e39..fc3a60603f 100644
--- a/tensorflow/docs_src/deploy/distributed.md
+++ b/tensorflow/docs_src/deploy/distributed.md
@@ -314,7 +314,7 @@ serve multiple clients.
**Cluster**
-A TensorFlow cluster comprises a one or more "jobs", each divided into lists of
+A TensorFlow cluster comprises one or more "jobs", each divided into lists of
one or more "tasks". A cluster is typically dedicated to a particular high-level
objective, such as training a neural network, using many machines in parallel. A
cluster is defined by
diff --git a/tensorflow/docs_src/guide/debugger.md b/tensorflow/docs_src/guide/debugger.md
index 8d78fe6fbd..f0e465214e 100644
--- a/tensorflow/docs_src/guide/debugger.md
+++ b/tensorflow/docs_src/guide/debugger.md
@@ -780,7 +780,7 @@ sess.run(b)
``` python
import numpy as np
-a = tf.Variable(np.ones[10], name="a")
+a = tf.Variable(np.ones(10), name="a")
b = tf.add(a, a, name="b")
sess = tf.Session()
sess.run(tf.global_variables_initializer())
diff --git a/tensorflow/docs_src/guide/saved_model.md b/tensorflow/docs_src/guide/saved_model.md
index acc3d3ca0b..717488e7cc 100644
--- a/tensorflow/docs_src/guide/saved_model.md
+++ b/tensorflow/docs_src/guide/saved_model.md
@@ -2,9 +2,8 @@
The @{tf.train.Saver} class provides methods to save and restore models. The
@{tf.saved_model.simple_save} function is an easy way to build a
-@{tf.saved_model$saved model} suitable for serving.
-[Estimators](@{$guide/estimators}) automatically save and restore
-variables in the `model_dir`.
+@{tf.saved_model$saved model} suitable for serving. [Estimators](./estimators)
+automatically save and restore variables in the `model_dir`.
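+
+As a minimal sketch (the tensor names and export directory below are
+illustrative), `simple_save` takes a session, an export directory, and
+dictionaries naming the input and output tensors:
+
+``` python
+import tensorflow as tf
+
+x = tf.placeholder(tf.float32, shape=[None, 3], name="x")
+y = tf.identity(x, name="y")
+
+with tf.Session() as sess:
+  tf.saved_model.simple_save(sess,
+                             "/tmp/simple_save_example",
+                             inputs={"x": x},
+                             outputs={"y": y})
+```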
## Save and restore variables
diff --git a/tensorflow/docs_src/guide/using_gpu.md b/tensorflow/docs_src/guide/using_gpu.md
index c429ca4750..c0218fd12e 100644
--- a/tensorflow/docs_src/guide/using_gpu.md
+++ b/tensorflow/docs_src/guide/using_gpu.md
@@ -143,7 +143,7 @@ If the device you have specified does not exist, you will get
```
InvalidArgumentError: Invalid argument: Cannot assign a device to node 'b':
Could not satisfy explicit device specification '/device:GPU:2'
- [[Node: b = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [3,2]
+ [[{{node b}} = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [3,2]
values: 1 2 3...>, _device="/device:GPU:2"]()]]
```
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 4c09ba1a8b..31dcad64d4 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -223,46 +223,90 @@ will likely differ from our sample input:
<pre>
$ <b>cd tensorflow</b> # cd to the top-level directory created
$ <b>./configure</b>
+You have bazel 0.15.0 installed.
Please specify the location of python. [Default is /usr/bin/python]: <b>/usr/bin/python2.7</b>
+
+
Found possible Python library paths:
/usr/local/lib/python2.7/dist-packages
/usr/lib/python2.7/dist-packages
Please input the desired Python library path to use. Default is [/usr/lib/python2.7/dist-packages]
-Using python library path: /usr/local/lib/python2.7/dist-packages
-Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]:
-Do you wish to use jemalloc as the malloc implementation? [Y/n]
-jemalloc enabled
-Do you wish to build TensorFlow with Google Cloud Platform support? [y/N]
-No Google Cloud Platform support will be enabled for TensorFlow
-Do you wish to build TensorFlow with Hadoop File System support? [y/N]
-No Hadoop File System support will be enabled for TensorFlow
-Do you wish to build TensorFlow with the XLA just-in-time compiler (experimental)? [y/N]
-No XLA support will be enabled for TensorFlow
-Do you wish to build TensorFlow with VERBS support? [y/N]
-No VERBS support will be enabled for TensorFlow
-Do you wish to build TensorFlow with OpenCL support? [y/N]
-No OpenCL support will be enabled for TensorFlow
-Do you wish to build TensorFlow with CUDA support? [y/N] <b>Y</b>
-CUDA support will be enabled for TensorFlow
-Do you want to use clang as CUDA compiler? [y/N]
-nvcc will be used as CUDA compiler
+Do you wish to build TensorFlow with jemalloc as malloc support? [Y/n]:
+jemalloc as malloc support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with Google Cloud Platform support? [Y/n]:
+Google Cloud Platform support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with Hadoop File System support? [Y/n]:
+Hadoop File System support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with Amazon AWS Platform support? [Y/n]:
+Amazon AWS Platform support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with Apache Kafka Platform support? [Y/n]:
+Apache Kafka Platform support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with XLA JIT support? [y/N]:
+No XLA JIT support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with GDR support? [y/N]:
+No GDR support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with VERBS support? [y/N]:
+No VERBS support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with OpenCL SYCL support? [y/N]:
+No OpenCL SYCL support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with CUDA support? [y/N]: <b>Y</b>
+CUDA support will be enabled for TensorFlow.
+
Please specify the CUDA SDK version you want to use. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
+
+
Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
-Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
-Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7</b>
+
+
+Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7.0</b>
+
+
Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
-Please specify a list of comma-separated CUDA compute capabilities you want to build with.
+
+
+Do you wish to build TensorFlow with TensorRT support? [y/N]:
+No TensorRT support will be enabled for TensorFlow.
+
+Please specify the NCCL version you want to use. If NCCL 2.2 is not installed, then you can use version 1.3 that can be fetched automatically but it may have worse performance with multiple GPUs. [Default is 2.2]: 1.3
+
+
+Please specify a list of comma-separated Cuda compute capabilities you want to build with.
You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
-Please note that each additional compute capability significantly increases your build time and binary size.
+Please note that each additional compute capability significantly increases your
+build time and binary size. [Default is: 3.5,7.0] <b>6.1</b>
+
+
+Do you want to use clang as CUDA compiler? [y/N]:
+nvcc will be used as CUDA compiler.
-Do you wish to build TensorFlow with MPI support? [y/N]
-MPI support will not be enabled for TensorFlow
+Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
+
+
+Do you wish to build TensorFlow with MPI support? [y/N]:
+No MPI support will be enabled for TensorFlow.
+
+Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]:
+
+
+Would you like to interactively configure ./WORKSPACE for Android builds? [y/N]:
+Not configuring the WORKSPACE for Android builds.
+
+Preconfigured Bazel build configs. You can use any of the below by adding "--config=<>" to your build command. See tools/bazel.rc for more details.
+ --config=mkl # Build with MKL support.
+ --config=monolithic # Config for mostly static monolithic build.
Configuration finished
</pre>
-[Default is: "3.5,7.0"]: <b>6.0,7.0</b>
-
If you told `configure` to build for GPU support, then `configure` will create a
canonical set of symbolic links to the CUDA libraries on your system. Therefore,
every time you change the CUDA library paths, you must rerun the `configure`
diff --git a/tensorflow/docs_src/mobile/README.md b/tensorflow/docs_src/mobile/README.md
new file mode 100644
index 0000000000..ecf4267265
--- /dev/null
+++ b/tensorflow/docs_src/mobile/README.md
@@ -0,0 +1,3 @@
+# TF Lite subsite
+
+This subsite directory lives in [tensorflow/contrib/lite/g3doc](../../contrib/lite/g3doc/).
diff --git a/tensorflow/docs_src/mobile/index.md b/tensorflow/docs_src/mobile/index.md
deleted file mode 100644
index 6032fcad02..0000000000
--- a/tensorflow/docs_src/mobile/index.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Overview
-
-TensorFlow was designed to be a good deep learning solution for mobile
-platforms. Currently we have two solutions for deploying machine learning
-applications on mobile and embedded devices:
-@{$mobile/mobile_intro$TensorFlow for Mobile} and @{$mobile/tflite$TensorFlow Lite}.
-
-## TensorFlow Lite versus TensorFlow Mobile
-
-Here are a few of the differences between the two:
-
-- TensorFlow Lite is an evolution of TensorFlow Mobile. In most cases, apps
- developed with TensorFlow Lite will have a smaller binary size, fewer
- dependencies, and better performance.
-
-- TensorFlow Lite supports only a limited set of operators, so not all models
- will work on it by default. TensorFlow for Mobile has a fuller set of
- supported functionality.
-
-TensorFlow Lite provides better performance and a small binary size on mobile
-platforms as well as the ability to leverage hardware acceleration if available
-on their platforms. In addition, it has many fewer dependencies so it can be
-built and hosted on simpler, more constrained device scenarios. TensorFlow Lite
-also allows targeting accelerators through the [Neural Networks
-API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
-
-TensorFlow Lite currently has coverage for a limited set of operators. While
-TensorFlow for Mobile supports only a constrained set of ops by default, in
-principle if you use an arbitrary operator in TensorFlow, it can be customized
-to build that kernel. Thus use cases which are not currently supported by
-TensorFlow Lite should continue to use TensorFlow for Mobile. As TensorFlow Lite
-evolves, it will gain additional operators, and the decision will be easier to
-make.
diff --git a/tensorflow/docs_src/mobile/leftnav_files b/tensorflow/docs_src/mobile/leftnav_files
deleted file mode 100644
index 97340ef7e1..0000000000
--- a/tensorflow/docs_src/mobile/leftnav_files
+++ /dev/null
@@ -1,15 +0,0 @@
-index.md
-### TensorFlow Lite
-tflite/index.md
-tflite/devguide.md
-tflite/demo_android.md
-tflite/demo_ios.md
-tflite/performance.md
->>>
-### TensorFlow Mobile
-mobile_intro.md
-android_build.md
-ios_build.md
-linking_libs.md
-prepare_models.md
-optimizing.md
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index cb0f5ca924..dafacbe379 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -464,7 +464,7 @@ equal to the number of physical cores rather than logical cores.
config = tf.ConfigProto()
config.intra_op_parallelism_threads = 44
config.inter_op_parallelism_threads = 44
- tf.session(config=config)
+ tf.Session(config=config)
```
diff --git a/tensorflow/docs_src/performance/xla/broadcasting.md b/tensorflow/docs_src/performance/xla/broadcasting.md
index eaa709c2f8..7018ded53f 100644
--- a/tensorflow/docs_src/performance/xla/broadcasting.md
+++ b/tensorflow/docs_src/performance/xla/broadcasting.md
@@ -99,7 +99,7 @@ dimensions 1 and 2 of the cuboid.
This type of broadcast is used in the binary ops in `XlaBuilder`, if the
`broadcast_dimensions` argument is given. For example, see
-[XlaBuilder::Add](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.cc).
+[XlaBuilder::Add](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.cc).
In the XLA source code, this type of broadcasting is sometimes called "InDim"
broadcasting.
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 68c427a316..5f7482f90f 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -1,7 +1,7 @@
# Operation Semantics
The following describes the semantics of operations defined in the
-[`XlaBuilder`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
interface. Typically, these operations map one-to-one to operations defined in
the RPC interface in
[`xla_data.proto`](https://www.tensorflow.org/code/tensorflow/compiler/xla/xla_data.proto).
@@ -16,7 +16,7 @@ and familiar names; for example a *vector* is a 1-dimensional array and a
## BatchNormGrad
See also
-[`XlaBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
for a detailed description of the algorithm.
@@ -80,7 +80,7 @@ The output type is a tuple of three handles:
## BatchNormInference
See also
-[`XlaBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
for a detailed description of the algorithm.
@@ -115,7 +115,7 @@ The output is an n-dimensional, normalized array with the same shape as input
## BatchNormTraining
See also
-[`XlaBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
and [`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
for a detailed description of the algorithm.
@@ -167,7 +167,7 @@ spatial dimensions using the formulas above.
## BitcastConvertType
See also
-[`XlaBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
operation from a data shape to a target shape. The dimensions must match, and
@@ -189,7 +189,7 @@ and destination element types must not be tuples.
## Broadcast
See also
-[`XlaBuilder::Broadcast`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Broadcast`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Adds dimensions to an array by duplicating the data in the array.
@@ -217,7 +217,7 @@ For example, if `operand` is a scalar `f32` with value `2.0f`, and
## Call
See also
-[`XlaBuilder::Call`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Call`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Invokes a computation with the given arguments.
@@ -236,7 +236,7 @@ The arity and types of the `args` must match the parameters of the
## Clamp
See also
-[`XlaBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Clamps an operand to within the range between a minimum and maximum value.
@@ -269,7 +269,7 @@ Clamp(min, operand, max) = s32[3]{0, 5, 6};
## Collapse
See also
-[`XlaBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
and the @{tf.reshape} operation.
Collapses dimensions of an array into one dimension.
@@ -332,7 +332,7 @@ then v12 == f32[8x3] {{10, 11, 12},
## Concatenate
See also
-[`XlaBuilder::ConcatInDim`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ConcatInDim`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Concatenate composes an array from multiple array operands. The array is of the
same rank as each of the input array operands (which must be of the same rank as
@@ -388,7 +388,7 @@ Diagram:
## Conditional
See also
-[`XlaBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b> `Conditional(pred, true_operand, true_computation, false_operand,
false_computation)` </b>
@@ -416,7 +416,7 @@ executed depending on the value of `pred`.
## Conv (convolution)
See also
-[`XlaBuilder::Conv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Conv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
As ConvWithGeneralPadding, but the padding is specified in a short-hand way as
either SAME or VALID. SAME padding pads the input (`lhs`) with zeroes so that
@@ -426,7 +426,7 @@ account. VALID padding simply means no padding.
## ConvWithGeneralPadding (convolution)
See also
-[`XlaBuilder::ConvWithGeneralPadding`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ConvWithGeneralPadding`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Computes a convolution of the kind used in neural networks. Here, a convolution
can be thought of as a n-dimensional window moving across a n-dimensional base
@@ -538,7 +538,7 @@ for (b, oz, oy, ox) { // output coordinates
## ConvertElementType
See also
-[`XlaBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Similar to an element-wise `static_cast` in C++, performs an element-wise
conversion operation from a data shape to a target shape. The dimensions must
@@ -572,7 +572,7 @@ then b == f32[3]{0.0, 1.0, 2.0}
## CrossReplicaSum
See also
-[`XlaBuilder::CrossReplicaSum`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::CrossReplicaSum`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Computes a sum across replicas.
@@ -607,7 +607,7 @@ than another.
## CustomCall
See also
-[`XlaBuilder::CustomCall`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::CustomCall`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Call a user-provided function within a computation.
@@ -668,7 +668,7 @@ idempotent.
## Dot
See also
-[`XlaBuilder::Dot`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Dot`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b> `Dot(lhs, rhs)` </b>
@@ -697,7 +697,7 @@ multiplications or matrix/matrix multiplications.
## DotGeneral
See also
-[`XlaBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b> `DotGeneral(lhs, rhs, dimension_numbers)` </b>
@@ -784,15 +784,13 @@ non-contracting/non-batch dimension.
## DynamicSlice
See also
-[`XlaBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
DynamicSlice extracts a sub-array from the input array at dynamic
`start_indices`. The size of the slice in each dimension is passed in
`size_indices`, which specify the end point of exclusive slice intervals in each
dimension: [start, start + size). The shape of `start_indices` must be rank ==
1, with dimension size equal to the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined.
<b> `DynamicSlice(operand, start_indices, size_indices)` </b>
@@ -812,6 +810,17 @@ calculation of 'start_indices') is currently implementation-defined.
: : : dimension to avoid wrapping modulo :
: : : dimension size. :
+The effective slice indices are computed by applying the following
+transformation for each index `i` in `[1, N)` before performing the slice:
+
+```
+start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - size_indices[i])
+```
+
+This ensures that the extracted slice is always in-bounds with respect to the
+operand array. If the slice is in-bounds before the transformation is applied,
+the transformation has no effect.
+
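+For example (a hypothetical 1-dimensional case): given a 5-element operand and
+`size_indices = {3}`, a runtime value of `start_indices = {4}` is clamped to
+`clamp(4, 0, 5 - 3) = 2`, so the slice covers elements [2, 5) instead of
+reading out of bounds.
+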
1-dimensional example:
```
@@ -839,7 +848,7 @@ DynamicSlice(b, s, {2, 2}) produces:
## DynamicUpdateSlice
See also
-[`XlaBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
DynamicUpdateSlice generates a result which is the value of the input array
`operand`, with a slice `update` overwritten at `start_indices`.
@@ -847,8 +856,6 @@ The shape of `update` determines the shape of the sub-array of the result which
is updated.
The shape of `start_indices` must be rank == 1, with dimension size equal to
the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined.
<b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
@@ -866,6 +873,17 @@ calculation of 'start_indices') is currently implementation-defined.
: : : dimension. Value must be greater than or equal :
: : : to zero. :
+The effective slice indices are computed by applying the following
+transformation for each index `i` in `[1, N)` before performing the slice:
+
+```
+start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - update.dimension_size[i])
+```
+
+This ensures that the updated slice is always in-bounds with respect to the
+operand array. If the slice is in-bounds before the transformation is applied,
+the transformation has no effect.
+
1-dimensional example:
```
@@ -902,7 +920,7 @@ DynamicUpdateSlice(b, u, s) produces:
## Element-wise binary arithmetic operations
See also
-[`XlaBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
A set of element-wise binary arithmetic operations is supported.
@@ -947,7 +965,7 @@ shapes of both operands. The semantics are described in detail on the
## Element-wise comparison operations
See also
-[`XlaBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
A set of standard element-wise binary comparison operations is supported. Note
that standard IEEE 754 floating-point comparison semantics apply when comparing
@@ -1033,7 +1051,7 @@ potentially different runtime offset) of an input tensor into an output tensor.
### General Semantics
See also
-[`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
For a more intuitive description, see the "Informal Description" section below.
<b> `gather(operand, gather_indices, output_window_dims, elided_window_dims, window_bounds, gather_dims_to_operand_dims)` </b>
@@ -1236,7 +1254,7 @@ concatenation of all these rows.
## GetTupleElement
See also
-[`XlaBuilder::GetTupleElement`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::GetTupleElement`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Indexes into a tuple with a compile-time-constant value.
@@ -1257,7 +1275,7 @@ See also @{tf.tuple}.
## Infeed
See also
-[`XlaBuilder::Infeed`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Infeed`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b> `Infeed(shape)` </b>
@@ -1293,17 +1311,30 @@ Infeed of the device.
> which case the compiler will provide information about how the Infeed
> operations are serialized in the compiled program.
+## Iota
+
+<b> `Iota()` </b>
+
+Builds a constant literal on device rather than performing a potentially large
+host transfer. Creates a rank 1 tensor of values starting at zero and
+incrementing by one.
+
+Arguments | Type | Semantics
+------------------ | --------------- | ---------------------------
+`type` | `PrimitiveType` | The element type U of the values in the tensor.
+`size` | `int64` | The number of elements in the tensor.
+
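+For example (a sketch of the semantics, not exact builder syntax),
+`Iota(S32, 4)` produces the rank-1 constant `s32[4] {0, 1, 2, 3}` directly on
+the device.
+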
## Map
See also
-[`XlaBuilder::Map`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Map`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b> `Map(operands..., computation)` </b>
| Arguments | Type | Semantics |
| ----------------- | ---------------------- | ------------------------------ |
| `operands` | sequence of N `XlaOp`s | N arrays of types T_0..T_{N-1} |
-| `computation` | `XlaComputation` | computation of type `T_0, T_1, |
+| `computation` | `XlaComputation` | computation of type `T_0, T_1, |
: : : ..., T_{N + M -1} -> S` with N :
: : : parameters of type T and M of :
: : : arbitrary type :
@@ -1325,7 +1356,7 @@ input arrays to produce the output array.
## Pad
See also
-[`XlaBuilder::Pad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Pad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b> `Pad(operand, padding_value, padding_config)` </b>
@@ -1364,7 +1395,7 @@ are all 0. The figure below shows examples of different `edge_padding` and
## Recv
See also
-[`XlaBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b> `Recv(shape, channel_handle)` </b>
@@ -1398,7 +1429,7 @@ complete and returns the received data.
## Reduce
See also
-[`XlaBuilder::Reduce`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Reduce`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Applies a reduction function to an array.
@@ -1515,7 +1546,7 @@ Reducing the 3D array over all its dimensions produces the scalar `84`.
## ReducePrecision
See also
-[`XlaBuilder::ReducePrecision`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ReducePrecision`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Models the effect of converting floating-point values to a lower-precision
format (such as IEEE-FP16) and back to the original format. The number of
@@ -1546,7 +1577,7 @@ portion of the conversion is then simply a no-op.
## ReduceWindow
See also
-[`XlaBuilder::ReduceWindow`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ReduceWindow`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Applies a reduction function to all elements in each window of the input
multi-dimensional array, producing an output multi-dimensional array with the
@@ -1629,7 +1660,7 @@ context of [`Reduce`](#reduce) for more details.
## Reshape
See also
-[`XlaBuilder::Reshape`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::Reshape`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
and the [`Collapse`](#collapse) operation.
Reshapes the dimensions of an array into a new configuration.
@@ -1710,7 +1741,7 @@ Reshape(5, {}, {1,1}) == f32[1x1] {{5}};
## Rev (reverse)
See also
-[`XlaBuilder::Rev`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Rev`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b>`Rev(operand, dimensions)`</b>
@@ -1732,7 +1763,7 @@ the two window dimensions during the gradient computation in neural networks.
## RngNormal
See also
-[`XlaBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Constructs an output of a given shape with random numbers generated following
the $$N(\mu, \sigma)$$ normal distribution. The parameters `mu` and `sigma`, and
@@ -1752,7 +1783,7 @@ be scalar valued.
## RngUniform
See also
-[`XlaBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Constructs an output of a given shape with random numbers generated following
the uniform distribution over the interval $$[a,b)$$. The parameters and output
@@ -1773,7 +1804,7 @@ is implementation-defined.
## Select
See also
-[`XlaBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Constructs an output array from elements of two input arrays, based on the
values of a predicate array.
@@ -1824,7 +1855,7 @@ the same shape!) then `pred` has to be a scalar of type `PRED`.
## SelectAndScatter
See also
-[`XlaBuilder::SelectAndScatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::SelectAndScatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
This operation can be considered as a composite operation that first computes
`ReduceWindow` on the `operand` array to select an element from each window, and
@@ -1904,7 +1935,7 @@ context of [`Reduce`](#reduce) for more details.
## Send
See also
-[`XlaBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b> `Send(operand, channel_handle)` </b>
@@ -1959,7 +1990,7 @@ computations. For example, below schedules lead to deadlocks.
## Slice
See also
-[`XlaBuilder::Slice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Slice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
Slicing extracts a sub-array from the input array. The sub-array is of the same
rank as the input and contains the values inside a bounding box within the input
@@ -2008,7 +2039,7 @@ Slice(b, {2, 1}, {4, 3}) produces:
## Sort
See also
-[`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
There are two versions of the Sort instruction: a single-operand and a
two-operand version.
@@ -2068,7 +2099,7 @@ This is the same as Reshape(operand, permutation,
## Tuple
See also
-[`XlaBuilder::Tuple`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Tuple`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
A tuple containing a variable number of data handles, each of which has its own
shape.
@@ -2087,7 +2118,7 @@ Tuples can be deconstructed (accessed) via the [`GetTupleElement`]
## While
See also
-[`XlaBuilder::While`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::While`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
<b> `While(condition, body, init)` </b>
diff --git a/tensorflow/docs_src/tutorials/_index.yaml b/tensorflow/docs_src/tutorials/_index.yaml
index c74fe58089..9534114689 100644
--- a/tensorflow/docs_src/tutorials/_index.yaml
+++ b/tensorflow/docs_src/tutorials/_index.yaml
@@ -2,6 +2,7 @@ project_path: /_project.yaml
book_path: /_book.yaml
description: <!--no description-->
landing_page:
+ custom_css_path: /site-assets/css/style.css
show_side_navs: True
rows:
- description: >
@@ -14,57 +15,6 @@ landing_page:
</p>
items:
- custom_html: >
- <style>
- .tfo-button-primary {
- background-color: #fca851;
- }
- .tfo-button-primary:hover {
- background-color: #ef6c02;
- }
-
- a.colab-button {
- display: inline-block;
- background: rgba(255, 255, 255, 0.75);
- padding: 4px 8px;
- border-radius: 4px;
- font-size: 11px!important;
- text-decoration: none;
- color:#aaa;border: none;
- font-weight: 300;
- border: solid 1px rgba(0, 0, 0, 0.08);
- border-bottom-color: rgba(0, 0, 0, 0.15);
- text-transform: uppercase;
- line-height: 16px
- }
- a.colab-button:hover {
- color: #666;
- background: white;
- border-color: rgba(0, 0, 0, 0.2);
- }
- a.colab-button span {
- background-image: url("/images/colab_logo_button.svg");
- background-repeat:no-repeat;background-size:20px;
- background-position-y:2px;display:inline-block;
- padding-left:24px;border-radius:4px;
- text-decoration:none;
- }
-
- /* adjust code block for smaller screens */
- @media screen and (max-width: 1000px) {
- .tfo-landing-row-item-code-block {
- flex-direction: column !important;
- }
- .tfo-landing-row-item-code-block > .devsite-landing-row-item-code {
- /*display: none;*/
- width: 100%;
- }
- }
- @media screen and (max-width: 720px) {
- .tfo-landing-row-item-code-block {
- display: none;
- }
- }
- </style>
<div class="devsite-landing-row-item-description">
<h3 class="hide-from-toc">Learn and use ML</h3>
<div class="devsite-landing-row-item-description-content">
diff --git a/tensorflow/examples/tutorials/mnist/mnist_deep.py b/tensorflow/examples/tutorials/mnist/mnist_deep.py
index 1e0294db27..5d8d8d84fe 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_deep.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_deep.py
@@ -34,6 +34,8 @@ from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
+import numpy
+
FLAGS = None
@@ -164,8 +166,15 @@ def main(_):
print('step %d, training accuracy %g' % (i, train_accuracy))
train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
- print('test accuracy %g' % accuracy.eval(feed_dict={
- x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
+ # compute in batches to avoid OOM on GPUs
+ accuracy_l = []
+ for _ in range(20):
+ batch = mnist.test.next_batch(500, shuffle=False)
+ accuracy_l.append(accuracy.eval(feed_dict={x: batch[0],
+ y_: batch[1],
+ keep_prob: 1.0}))
+ print('test accuracy %g' % numpy.mean(accuracy_l))
+
if __name__ == '__main__':
parser = argparse.ArgumentParser()
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index e251356ec8..288a32530a 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -46,7 +46,7 @@ from source.
```sh
cd ${GOPATH}/src/github.com/tensorflow/tensorflow
./configure
- bazel build --config opt //tensorflow:libtensorflow.so
+ bazel build -c opt //tensorflow:libtensorflow.so
```
This can take a while (tens of minutes, more if also building for GPU).
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 18d7425323..6c9bf1e714 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -5225,12 +5225,26 @@ func IsBoostedTreesEnsembleInitialized(scope *Scope, tree_ensemble_handle tf.Out
return op.Output(0)
}
+// CastAttr is an optional argument to Cast.
+type CastAttr func(optionalAttr)
+
+// CastTruncate sets the optional Truncate attribute to value.
+// If not specified, defaults to false
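+// For example (hypothetical usage): Cast(scope, x, tf.Int32, CastTruncate(true)).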
+func CastTruncate(value bool) CastAttr {
+ return func(m optionalAttr) {
+ m["Truncate"] = value
+ }
+}
+
// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
if scope.Err() != nil {
return
}
attrs := map[string]interface{}{"DstT": DstT}
+ for _, a := range optional {
+ a(attrs)
+ }
opspec := tf.OpSpec{
Type: "Cast",
Input: []tf.Input{
diff --git a/tensorflow/java/maven/hadoop/pom.xml b/tensorflow/java/maven/hadoop/pom.xml
index 7391dfb965..2c2c4106cb 100644
--- a/tensorflow/java/maven/hadoop/pom.xml
+++ b/tensorflow/java/maven/hadoop/pom.xml
@@ -5,7 +5,7 @@
<groupId>org.tensorflow</groupId>
<artifactId>hadoop</artifactId>
<packaging>jar</packaging>
- <version>1.9.0</version>
+ <version>1.10.0-rc0</version>
<name>tensorflow-hadoop</name>
<url>https://www.tensorflow.org</url>
<description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description>
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index d44bdf8f81..5d4e04ecd3 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
<parent>
<groupId>org.tensorflow</groupId>
<artifactId>parentpom</artifactId>
- <version>1.9.0</version>
+ <version>1.10.0-rc0</version>
<relativePath>../</relativePath>
</parent>
<artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index e8925c6fb1..e107904f7d 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
<parent>
<groupId>org.tensorflow</groupId>
<artifactId>parentpom</artifactId>
- <version>1.9.0</version>
+ <version>1.10.0-rc0</version>
<relativePath>../</relativePath>
</parent>
<artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 3bf4a2590c..b3c525233f 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
<parent>
<groupId>org.tensorflow</groupId>
<artifactId>parentpom</artifactId>
- <version>1.9.0</version>
+ <version>1.10.0-rc0</version>
<relativePath>../</relativePath>
</parent>
<artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index b96dcf2888..a2943a3172 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.tensorflow</groupId>
<artifactId>parentpom</artifactId>
- <version>1.9.0</version>
+ <version>1.10.0-rc0</version>
<packaging>pom</packaging>
<url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 5581d864d7..7080d81b7d 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
<parent>
<groupId>org.tensorflow</groupId>
<artifactId>parentpom</artifactId>
- <version>1.9.0</version>
+ <version>1.10.0-rc0</version>
<relativePath>../</relativePath>
</parent>
<artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/spark-connector/pom.xml b/tensorflow/java/maven/spark-connector/pom.xml
index 64956be02c..003d09a0b7 100644
--- a/tensorflow/java/maven/spark-connector/pom.xml
+++ b/tensorflow/java/maven/spark-connector/pom.xml
@@ -6,7 +6,7 @@
<groupId>org.tensorflow</groupId>
<artifactId>spark-connector_2.11</artifactId>
<packaging>jar</packaging>
- <version>1.9.0</version>
+ <version>1.10.0-rc0</version>
<name>spark-tensorflow-connector</name>
<url>https://www.tensorflow.org</url>
<description>TensorFlow TFRecord connector for Apache Spark DataFrames</description>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 92e15aa2c7..b9affbf699 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
<parent>
<groupId>org.tensorflow</groupId>
<artifactId>parentpom</artifactId>
- <version>1.9.0</version>
+ <version>1.10.0-rc0</version>
<relativePath>../</relativePath>
</parent>
<artifactId>tensorflow</artifactId>
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index c33a579ad2..d35731d3cd 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -96,6 +96,7 @@ py_library(
":image_ops",
":initializers_ns",
":io_ops",
+ ":kernels",
":layers",
":lib",
":list_ops",
@@ -745,8 +746,8 @@ py_library(
srcs_version = "PY2AND3",
deps = [
":framework",
+ ":framework_ops",
":function",
- ":op_def_registry",
":tensor_shape",
":versions",
"//tensorflow/core:protos_all_py",
@@ -762,8 +763,10 @@ py_test(
deps = [
":array_ops",
":client_testlib",
+ ":constant_op",
":dtypes",
":framework_ops",
+ ":function",
":function_def_to_graph",
":graph_to_function_def",
":math_ops",
@@ -788,6 +791,19 @@ py_library(
)
py_library(
+ name = "kernels",
+ srcs = [
+ "framework/kernels.py",
+ ],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":pywrap_tensorflow",
+ ":util",
+ "//tensorflow/core:protos_all_py",
+ ],
+)
+
+py_library(
name = "op_def_library",
srcs = ["framework/op_def_library.py"],
srcs_version = "PY2AND3",
@@ -1480,6 +1496,20 @@ py_test(
],
)
+py_test(
+ name = "framework_kernels_test",
+ size = "small",
+ srcs = ["framework/kernels_test.py"],
+ main = "framework/kernels_test.py",
+ srcs_version = "PY2AND3",
+ deps = [
+ ":framework_test_lib",
+ ":kernels",
+ ":platform_test",
+ ":test_ops",
+ ],
+)
+
tf_gen_op_wrapper_private_py(
name = "array_ops_gen",
visibility = [
@@ -3377,6 +3407,7 @@ py_test(
":math_ops",
":util",
"//third_party/py/numpy",
+ "@absl_py//absl/testing:parameterized",
],
)
@@ -3627,6 +3658,7 @@ tf_cuda_library(
"//tensorflow/core:graph",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
+ "//tensorflow/core:session_ref",
"//third_party/py/numpy:headers",
"//third_party/python_runtime:headers",
],
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 8ede6ab54c..f8e20e1b89 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import collections
import functools
import re
import threading
@@ -243,7 +244,7 @@ class _FetchMapper(object):
elif isinstance(fetch, (list, tuple)):
# NOTE(touts): This is also the code path for namedtuples.
return _ListFetchMapper(fetch)
- elif isinstance(fetch, dict):
+ elif isinstance(fetch, collections.Mapping):
return _DictFetchMapper(fetch)
else:
# Look for a handler in the registered expansions.
@@ -629,7 +630,7 @@ class BaseSession(SessionInterface):
opts = tf_session.TF_NewSessionOptions(target=self._target, config=config)
try:
# pylint: disable=protected-access
- self._session = tf_session.TF_NewSession(self._graph._c_graph, opts)
+ self._session = tf_session.TF_NewSessionRef(self._graph._c_graph, opts)
# pylint: enable=protected-access
finally:
tf_session.TF_DeleteSessionOptions(opts)
@@ -1234,8 +1235,12 @@ class BaseSession(SessionInterface):
return _fetch_handler_run
- # Captures the name of a node in an error status.
- _NODEDEF_NAME_RE = re.compile(r'\[\[Node: ([^ ]*?) =')
+ # Captures the name of a node in an error status. The regex below matches
+ # both the old and the new formats:
+ # Old format: [[Node: <node_name> = ...]]
+ # New format: [[{{node <node_name>}} = ...]]
+ _NODEDEF_NAME_RE = re.compile(
+ r'\[\[(Node: )?(\{\{node )?([^\} ]*)(\}\})?\s*=')
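+ # For example, both '[[Node: b = Const[...]' and '[[{{node b}} = Const[...]'
+ # match, yielding node name 'b' in group 3.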
def _do_run(self, handle, target_list, fetch_list, feed_dict, options,
run_metadata):
@@ -1290,7 +1295,7 @@ class BaseSession(SessionInterface):
node_def = None
op = None
if m is not None:
- node_name = m.group(1)
+ node_name = m.group(3)
try:
op = self._graph.get_operation_by_name(node_name)
node_def = op.node_def
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 1cdd8e0b6a..39a2922ac0 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -777,6 +777,7 @@ def TF_Reset(target, containers=None, config=None):
$1 = &types_local;
}
+%unignore TF_NewSessionRef;
%unignore SetRequireShapeInferenceFns;
%unignore TF_TryEvaluateConstant_wrapper;
%noexception TF_TryEvaluateConstant_wrapper;
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index b6481e7e29..bcd4af2912 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/c/c_api.h"
#include "tensorflow/c/c_api_internal.h"
#include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/core/common_runtime/session_ref.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/attr_value_util.h"
@@ -42,6 +43,19 @@ static const char* kFeedDictErrorMsg =
"feed_dict must be a dictionary mapping strings to NumPy arrays.";
} // end namespace
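+// Wraps the underlying Session of a newly created TF_Session in a SessionRef.
+// The SessionRef (see tensorflow/core/common_runtime/session_ref.h) is assumed
+// here to track outstanding calls so that the session is not destroyed out
+// from under a concurrent user.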
+TF_Session* TF_NewSessionRef(TF_Graph* graph, const TF_SessionOptions* opts,
+ TF_Status* status) {
+ TF_Session* tf_session = TF_NewSession(graph, opts, status);
+ if (tf_session == nullptr) {
+ return nullptr;
+ }
+
+ Session* session = reinterpret_cast<Session*>(tf_session->session);
+ SessionRef* session_ref = new SessionRef(session);
+ tf_session->session = session_ref;
+ return tf_session;
+}
+
void TF_Run_wrapper_helper(TF_DeprecatedSession* session, const char* handle,
const TF_Buffer* run_options, PyObject* feed_dict,
const NameVector& output_names,
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index cfd27c2bee..dab7e71aac 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -40,6 +40,9 @@ typedef tensorflow::gtl::InlinedVector<PyObject*, 8> PyObjectVector;
// A TF_TensorVector is a vector of borrowed pointers to TF_Tensors.
typedef gtl::InlinedVector<TF_Tensor*, 8> TF_TensorVector;
+TF_Session* TF_NewSessionRef(TF_Graph* graph, const TF_SessionOptions* opts,
+ TF_Status* status);
+
// Run the graph associated with the session starting with the
// supplied inputs[]. Regardless of success or failure, inputs[] are
// stolen by the implementation (i.e. the implementation will
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 3ef22cf981..494df178df 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -57,6 +57,13 @@ GET_NEXT_CALL_WARNING_MESSAGE = (
GLOBAL_ITERATORS = "iterators"
+def _device_stack_is_empty():
+ # pylint: disable=protected-access
+ device_stack = ops.get_default_graph()._device_functions_outer_to_inner
+ # pylint: enable=protected-access
+ return not bool(device_stack)
+
+
@tf_export("data.Iterator")
class Iterator(object):
"""Represents the state of iterating through a `Dataset`."""
@@ -174,7 +181,7 @@ class Iterator(object):
if shared_name is None:
shared_name = ""
if compat.forward_compatible(2018, 8, 3):
- if not ops.get_default_graph()._graph_device_function_stack: # pylint: disable=protected-access
+ if _device_stack_is_empty():
with ops.device("/cpu:0"):
iterator_resource = gen_dataset_ops.iterator_v2(
container="",
@@ -263,7 +270,7 @@ class Iterator(object):
nest.assert_same_structure(output_types, output_shapes)
string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
if compat.forward_compatible(2018, 8, 3):
- if not ops.get_default_graph()._graph_device_function_stack: # pylint: disable=protected-access
+ if _device_stack_is_empty():
with ops.device("/cpu:0"):
iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
string_handle,
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
new file mode 100644
index 0000000000..2bd0b4320a
--- /dev/null
+++ b/tensorflow/python/distribute/BUILD
@@ -0,0 +1,43 @@
+package(
+ default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"]) # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+ name = "distribute_coordinator",
+ srcs = [
+ "distribute_coordinator.py",
+ ],
+ srcs_version = "PY2AND3",
+ deps = [
+ "//tensorflow/core:protos_all_py",
+ "//tensorflow/python:training",
+ ],
+)
+
+py_test(
+ name = "distribute_coordinator_test",
+ size = "small",
+ srcs = ["distribute_coordinator_test.py"],
+ srcs_version = "PY2AND3",
+ tags = ["no_pip"],
+ deps = [
+ ":distribute_coordinator",
+ "//tensorflow/core:protos_all_py",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:control_flow_ops",
+ "//tensorflow/python:distributed_framework_test_lib",
+ "//tensorflow/python:framework_ops",
+ "//tensorflow/python:framework_test_lib",
+ "//tensorflow/python:math_ops",
+ "//tensorflow/python:session",
+ "//tensorflow/python:training",
+ "//tensorflow/python:variable_scope",
+ "//tensorflow/python:variables",
+ ],
+)
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
new file mode 100644
index 0000000000..04c50dbafc
--- /dev/null
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -0,0 +1,361 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A unified and split coordinator for distributed TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import json
+import os
+import threading
+
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.training import server_lib
+
+
+class _TaskType(object):
+ PS = "ps"
+ WORKER = "worker"
+ CHIEF = "chief"
+ EVALUATOR = "evaluator"
+
+
+_coordinator_context = threading.local()
+
+
+def get_current_coordinator_context():
+ """Returns the current coordinator context."""
+ try:
+ return _coordinator_context.current
+ except AttributeError:
+ return None
+
+
+class _Barrier(object):
+ """A reusable barrier class for worker synchronization."""
+
+ def __init__(self, num_participants):
+ """Initializes the barrier object.
+
+ Args:
+ num_participants: an integer which is the expected number of calls to
+ `wait` that pass through this barrier.
+ """
+ self._num_participants = num_participants
+ self._counter = 0
+ self._flag = False
+ self._local_sense = threading.local()
+ self._lock = threading.Lock()
+ self._condition = threading.Condition()
+
+ def wait(self):
+ """Waits until all other callers reach the same wait call."""
+ if not hasattr(self._local_sense, "value"):
+ self._local_sense.value = False
+ self._local_sense.value = not self._flag
+ with self._lock:
+ self._counter += 1
+ if self._counter == self._num_participants:
+ self._counter = 0
+ self._flag = self._local_sense.value
+ with self._condition:
+ while self._flag != self._local_sense.value:
+ self._condition.wait()
+ self._condition.notify_all()
+
+
+def _get_num_workers(cluster_spec):
+ """Gets number of workers including chief."""
+ if not cluster_spec:
+ return 0
+ return len(cluster_spec.as_dict().get(_TaskType.WORKER, [])) + len(
+ cluster_spec.as_dict().get(_TaskType.CHIEF, []))
+
+
+class _CoordinatorContext(object):
+ """The coordinator context class.
+
+ This context object provides configuration information for each task. One
+ context manager with a coordinator context object will be created per
+ invocation to the `worker_fn` where `get_current_coordinator_context` can be
+ called to access the coordinator context object.
+ """
+
+ def __init__(self,
+ cluster_spec,
+ task_type,
+ task_id,
+ between_graph=False,
+ rpc_layer="grpc",
+ worker_barrier=None):
+ """Initialize the coordinator context object.
+
+ Args:
+ cluster_spec: a ClusterSpec object. It can be empty or None in the local
+ training case.
+ task_type: a string indicating the role of the corresponding task, such as
+ "worker" or "ps". It can be None if it is local training or
+ `between_graph` is False.
+ task_id: an integer indicating id of the corresponding task. It can be
+ None if it is local training or `between_graph` is False.
+ between_graph: whether it is between-graph replication or not.
+ rpc_layer: optional string specifying the RPC protocol for communication
+ with worker masters. If None or empty, hosts in the `cluster_spec` will
+ be used directly.
+ worker_barrier: optional, the barrier object for worker synchronization.
+
+ Raises:
+ ValueError: if task_type or task_id is None or empty and it is
+ distributed between-graph replicated training.
+ """
+ if cluster_spec and between_graph:
+ if not task_type or task_id is None:
+ raise ValueError("`task_type` and `task_id` must be set in the "
+ "distributed between-graph replicated training.")
+ if task_type not in cluster_spec.jobs:
+ raise ValueError("`task_type` %r not found in the `cluster_spec` %r" %
+ (task_type, cluster_spec))
+ self._cluster_spec = cluster_spec
+ self._between_graph = between_graph
+ self._task_type = task_type
+ self._task_id = task_id
+ self._worker_barrier = worker_barrier
+ self._rpc_layer = rpc_layer
+ self._master_target = self._get_master_target()
+ self._num_workers = _get_num_workers(cluster_spec)
+ self._is_chief_node = self._is_chief()
+
+ def __enter__(self):
+ old_context = get_current_coordinator_context()
+ if old_context:
+ raise ValueError(
+ "You cannot run distribute coordinator in a `worker_fn`.")
+ _coordinator_context.current = self
+
+ def __exit__(self, unused_exception_type, unused_exception_value,
+ unused_traceback):
+ _coordinator_context.current = None
+
+ def _get_master_target(self):
+ """Return the master target for a task."""
+ # If cluster_spec is None or empty, we use local master.
+ if not self._cluster_spec:
+ return "local"
+
+ # If task_type is None, then it is in-graph replicated training. In this
+ # case we use the chief or first worker's master target.
+ if not self._task_type:
+ if _TaskType.CHIEF in self._cluster_spec.jobs:
+ assert not self._between_graph
+ task_type = _TaskType.CHIEF
+ task_id = 0
+ else:
+ assert _TaskType.WORKER in self._cluster_spec.jobs
+ task_type = _TaskType.WORKER
+ task_id = 0
+ else:
+ task_type = self._task_type
+ task_id = self._task_id
+
+ prefix = ""
+ if self._rpc_layer:
+ prefix = self._rpc_layer + "://"
+ return prefix + self._cluster_spec.job_tasks(task_type)[task_id or 0]
+
+ def _is_chief(self):
+ """Return whether the task is the chief worker."""
+ if (not self._cluster_spec or self._task_type in [_TaskType.CHIEF, None]):
+ return True
+
+ # If not local and chief not in the cluster_spec, use the first worker as
+ # chief.
+ if (_TaskType.CHIEF not in self._cluster_spec.jobs and
+ self._task_type == _TaskType.WORKER and self._task_id == 0):
+ return True
+ return False
+
+ def wait_for_other_workers(self):
+ """Waits for other workers to reach the same call to this method.
+
+ Raises:
+ ValueError: if `worker_barrier` is not passed to the __init__ method.
+ """
+ if not self._worker_barrier:
+ raise ValueError(
+ "`worker_barrier is not set in the coordinator context.`")
+ self._worker_barrier.wait()
+
+ @property
+ def distributed_mode(self):
+ """Whether it is distributed training or not."""
+ return bool(self._cluster_spec)
+
+ @property
+ def cluster_spec(self):
+ """Returns a copy of the cluster_spec object."""
+ return copy.deepcopy(self._cluster_spec)
+
+ @property
+ def task_type(self):
+ """Returns the role of the corresponing task."""
+ return self._task_type
+
+ @property
+ def task_id(self):
+ """Returns the id or index of the corresponing task."""
+ return self._task_id
+
+ @property
+ def master_target(self):
+ """Returns the session master for the corresponding task to connect to."""
+ return self._master_target
+
+ @property
+ def is_chief(self):
+ """Returns whether the task is a chief node."""
+ return self._is_chief_node
+
+ @property
+ def num_workers(self):
+ """Returns number of workers in the cluster, including chief."""
+ return self._num_workers
+
+
+def _run(worker_fn, cluster_spec, task_type, task_id, between_graph, rpc_layer,
+ worker_barrier):
+ with _CoordinatorContext(cluster_spec, task_type, task_id, between_graph,
+ rpc_layer, worker_barrier):
+ worker_fn()
+
+
+def run_distribute_coordinator(worker_fn,
+ cluster_spec=None,
+ between_graph=False,
+ rpc_layer=None):
+ """Run the coordinator for distributed TensorFlow.
+
+ This function runs a unified and split coordinator for distributed TensorFlow.
+ Given a `cluster_spec` specifying server addresses and their roles in a
+ cluster, this coordinator will figure out how to set them up, give the
+ underlying function the right targets for master sessions and coordinate their
+ training.
+
+ In addition to being the distribute coordinator, this is also the source of
+ configurations for each job in the distributed training. As there are multiple
+ ways to configure a distributed TensorFlow cluster, its context object
+ provides these configurations so that users or higher-level APIs don't have to
+ figure out the configuration for each job by themselves.
+
+ In between-graph replicated training, this coordinator will create multiple
+ threads, each of which calls the `worker_fn`, which is supposed to create
+ its own graph and connect to one worker master given by its coordinator
+ context. In in-graph replicated training, only one thread calls this
+ `worker_fn`.
+
+ The `worker_fn` defines the training logic and is called under its own
+ coordinator context, which can be accessed via
+ `get_current_coordinator_context`. A coordinator context provides access to
+ configurations for each task, e.g. the task_type, task_id, master target and
+ so on. Since `worker_fn` will be called in a thread, and possibly multiple
+ times, callers should be careful when accessing global data. For example, it
+ is unsafe to define flags in a `worker_fn` or to define different environment
+ variables for different `worker_fn`s.
+
+ The `worker_fn` for between-graph replication is defined as if there is
+ only one worker corresponding to the `worker_fn` and possibly ps jobs. It
+ assigns variables to parameter servers and all other operations to that
+ worker. In the in-graph replication case, the `worker_fn` has to define
+ operations for all worker jobs. Using a distribution strategy can simplify the
+ `worker_fn` by not having to worry about the replication and device assignment
+ of variables and operations.
+
+ This method is intended to be invoked by high-level APIs so that users don't
+ have to explicitly call it to run this coordinator. For those who don't use
+ high-level APIs, to change a program to use this coordinator, wrap everything
+ in the program after global data definitions, such as command-line flag
+ definitions, into the `worker_fn` and get task-specific configurations from
+ the coordinator context.
+
+ The `cluster_spec` can be either passed as the argument or parsed from the
+ "TF_CONFIG" environment variable. Example of a TF_CONFIG:
+ ```
+ cluster = {'chief': ['host0:2222'],
+ 'ps': ['host1:2222', 'host2:2222'],
+ 'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
+ os.environ['TF_CONFIG'] = json.dumps({'cluster': cluster})
+ ```
+
+ If `cluster_spec` is not given in any format, local training is assumed and
+ this coordinator will connect to a local session.
+
+ For evaluation, if "evaluator" exists in the cluster_spec, a separate thread
+ will be created with its `task_type` set to "evaluator". If "evaluator" is
+ not set in the cluster_spec, how to do evaluation is left entirely to the
+ `worker_fn`.
+
+ Args:
+ worker_fn: the function to be called and given the access to a coordinator
+ context object.
+ cluster_spec: a dict, ClusterDef or ClusterSpec specifying servers and roles
+ in a cluster. If not set or empty, fall back to local training.
+ between_graph: a boolean. It is only useful when `cluster_spec` is set and
+ not empty. If true, it will use between-graph replicated training;
+ otherwise it will use in-graph replicated training.
+ rpc_layer: optional string, the protocol for RPC, e.g. "grpc".
+
+ Raises:
+ ValueError: if `cluster_spec` is supplied but not a dict or a ClusterDef or
+ a ClusterSpec.
+ """
+ if not cluster_spec:
+ tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
+ cluster_spec = tf_config.get("cluster", {})
+
+ if cluster_spec:
+ if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
+ cluster_spec = server_lib.ClusterSpec(cluster_spec)
+ elif not isinstance(cluster_spec, server_lib.ClusterSpec):
+ raise ValueError(
+ "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
+ "`tf.train.ClusterDef` object")
+ # TODO(yuefengz): validate cluster_spec.
+
+ threads = []
+ if cluster_spec and _TaskType.EVALUATOR in cluster_spec.jobs:
+ t = threading.Thread(
+ target=_run,
+ args=(worker_fn, cluster_spec, _TaskType.EVALUATOR, 0, between_graph,
+ rpc_layer, None))
+ t.start()
+ threads.append(t)
+
+ if cluster_spec and between_graph:
+ worker_barrier = _Barrier(_get_num_workers(cluster_spec))
+ for task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
+ for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
+ t = threading.Thread(
+ target=_run,
+ args=(worker_fn, cluster_spec, task_type, task_id, between_graph,
+ rpc_layer, worker_barrier))
+ t.start()
+ threads.append(t)
+ else:
+ # Local or in-graph replicated training.
+ _run(worker_fn, cluster_spec, None, None, between_graph, rpc_layer, None)
+
+ # TODO(yuefengz): wrap threads into a thread coordinator?
+ for t in threads:
+ t.join()
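As a usage sketch (not part of this change; the hostnames are placeholders and no real servers are started), a `worker_fn` typically reads its task-specific settings from the coordinator context, while the cluster comes in through TF_CONFIG:
```
import json
import os

from tensorflow.python.distribute import distribute_coordinator

def worker_fn():
  # Each thread gets its own coordinator context.
  context = distribute_coordinator.get_current_coordinator_context()
  print(context.task_type, context.task_id, context.master_target)
  if context.is_chief:
    pass  # e.g. run variable initialization here
  context.wait_for_other_workers()  # barrier across the worker threads

os.environ["TF_CONFIG"] = json.dumps(
    {"cluster": {"worker": ["host1:2222", "host2:2222"]}})
distribute_coordinator.run_distribute_coordinator(
    worker_fn, between_graph=True, rpc_layer="grpc")
```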
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
new file mode 100644
index 0000000000..82fd823352
--- /dev/null
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -0,0 +1,291 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for distribute coordinator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import copy
+import threading
+import six
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.distribute import distribute_coordinator
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+CHIEF = distribute_coordinator._TaskType.CHIEF
+WORKER = distribute_coordinator._TaskType.WORKER
+PS = distribute_coordinator._TaskType.PS
+EVALUATOR = distribute_coordinator._TaskType.EVALUATOR
+
+NUM_WORKERS = 3
+NUM_PS = 2
+
+
+def _bytes_to_str(maybe_bytes):
+ if isinstance(maybe_bytes, six.string_types):
+ return maybe_bytes
+ else:
+ return str(maybe_bytes, "utf-8")
+
+
+class DistributeCoordinatorTest(test.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ # We have to create a global in-process cluster because once an in-process
+ # tensorflow server is created, there is no way to terminate it. Please see
+ # multi_worker_test_base.py for more details.
+ cls._workers, cls._ps = test_util.create_local_cluster(
+ NUM_WORKERS, num_ps=NUM_PS)
+ cls._cluster_spec = {
+ WORKER: [_bytes_to_str(w.target) for w in cls._workers],
+ PS: [_bytes_to_str(ps.target) for ps in cls._ps]
+ }
+
+ def setUp(self):
+ self._result_correct = 0
+ self._lock = threading.Lock()
+ self._task_context = {}
+
+ @contextlib.contextmanager
+ def _test_session(self, target):
+ config = config_pb2.ConfigProto(allow_soft_placement=True)
+ config.graph_options.optimizer_options.opt_level = -1
+ with session.Session(graph=None, config=config, target=target) as sess:
+ yield sess
+
+ def _in_graph_worker_fn(self):
+ context = distribute_coordinator.get_current_coordinator_context()
+ self.assertTrue(context is not None)
+ with self._test_session(target=context.master_target) as sess:
+ xs = []
+ expected = 0.0
+ for i in range(context.num_workers):
+ with ops.device("/job:worker/task:%d" % i):
+ x = variable_scope.get_variable("x_%d" % i, initializer=10.0)
+ x_add = x.assign_add(float(i))
+ xs.append(x_add)
+ expected += i + 10.0
+
+ with ops.device("/job:worker/task:0"):
+ result = math_ops.add_n(xs)
+
+ variables.global_variables_initializer().run()
+ result_value = sess.run(result)
+ self.assertEqual(result_value, expected)
+ if result_value == expected:
+ self._result_correct += 1
+
+ def testInGraph(self):
+ """Test it runs in-graph replicated training correctly."""
+ distribute_coordinator.run_distribute_coordinator(
+ self._in_graph_worker_fn,
+ cluster_spec=self._cluster_spec,
+ between_graph=False)
+ self.assertEqual(self._result_correct, 1)
+
+ def _between_graph_worker_fn(self):
+ context = distribute_coordinator.get_current_coordinator_context()
+ self.assertTrue(context is not None)
+ with self._test_session(target=context.master_target) as sess:
+ with ops.device("/job:ps/task:0"):
+ # TODO(yuefengz): investigate why the test becomes flaky when not using
+ # resource variables.
+ x = variable_scope.get_variable(
+ "x", initializer=10.0, use_resource=True)
+ with ops.device("/job:ps/task:1"):
+ y = variable_scope.get_variable(
+ "y", initializer=20.0, use_resource=True)
+
+ x_add = x.assign_add(2.0)
+ y_sub = y.assign_sub(2.0)
+ train_op = control_flow_ops.group([x_add, y_sub])
+
+ if context.is_chief:
+ variables.global_variables_initializer().run()
+
+ # Synchronize workers after initialization.
+ context.wait_for_other_workers()
+
+ sess.run(train_op)
+
+ # Synchronize workers after one step to make sure they all have finished
+ # training.
+ context.wait_for_other_workers()
+
+ x_val, y_val = sess.run([x, y])
+
+ self.assertEqual(x_val, 16.0)
+ self.assertEqual(y_val, 14.0)
+ if x_val == 16.0 and y_val == 14.0:
+ with self._lock:
+ self._result_correct += 1
+
+ def testBetweenGraph(self):
+ """Test it runs between-graph replicated training correctly."""
+ distribute_coordinator.run_distribute_coordinator(
+ self._between_graph_worker_fn,
+ cluster_spec=self._cluster_spec,
+ between_graph=True)
+
+ # Each finished worker will increment self._result_correct.
+ self.assertEqual(self._result_correct, NUM_WORKERS)
+
+ def _dump_task_context(self):
+ """Dumps the propoerties of each coordinator context.
+
+ It dumps the context properties to a dict mapping from task_type to a list
+ of tuples of master_target, num_workers, is_chief and distribute_mode, where
+ the list is indexed by the task_id.
+ """
+ context = distribute_coordinator.get_current_coordinator_context()
+ self.assertTrue(context is not None)
+ task_type = str(context.task_type)
+ task_id = context.task_id or 0
+ with self._lock:
+ if task_type not in self._task_context:
+ self._task_context[task_type] = []
+ while len(self._task_context[task_type]) <= task_id:
+ self._task_context[task_type].append(None)
+ self._task_context[task_type][task_id] = (context.master_target,
+ context.num_workers,
+ context.is_chief,
+ context.distributed_mode)
+
+ def testBetweenGraphContext(self):
+ # Dumps the task contexts to the self._task_context dict.
+ distribute_coordinator.run_distribute_coordinator(
+ self._dump_task_context,
+ cluster_spec=self._cluster_spec,
+ between_graph=True)
+
+ # There is only one type of task and there are three such tasks.
+ self.assertEqual(len(self._task_context), 1)
+ self.assertTrue(WORKER in self._task_context)
+ self.assertEqual(len(self._task_context[WORKER]), NUM_WORKERS)
+
+ # Check whether each task has the right master_target, num_workers, is_chief
+ # and distributed_mode.
+ self.assertEqual(
+ self._task_context[WORKER][0],
+ (_bytes_to_str(self._workers[0].target), NUM_WORKERS, True, True))
+ self.assertEqual(
+ self._task_context[WORKER][1],
+ (_bytes_to_str(self._workers[1].target), NUM_WORKERS, False, True))
+ self.assertEqual(
+ self._task_context[WORKER][2],
+ (_bytes_to_str(self._workers[2].target), NUM_WORKERS, False, True))
+
+ def testInGraphContext(self):
+ # Dumps the task contexts to the self._task_context dict.
+ distribute_coordinator.run_distribute_coordinator(
+ self._dump_task_context,
+ cluster_spec=self._cluster_spec,
+ between_graph=False)
+
+ # There is only a "None" task in the dumped task context.
+ self.assertEqual(len(self._task_context), 1)
+ self.assertTrue("None" in self._task_context)
+ self.assertEqual(len(self._task_context["None"]), 1)
+
+ # Check whether each task has the right master_target, num_workers, is_chief
+ # and distributed_mode.
+ self.assertEqual(
+ self._task_context["None"][0],
+ (_bytes_to_str(self._workers[0].target), NUM_WORKERS, True, True))
+
+ def testLocalContext(self):
+ # Dumps the task contexts to the self._task_context dict.
+ distribute_coordinator.run_distribute_coordinator(
+ self._dump_task_context, cluster_spec=None, between_graph=True)
+
+ # There is only a "None" task.
+ self.assertEqual(len(self._task_context), 1)
+ self.assertTrue("None" in self._task_context)
+ self.assertEqual(len(self._task_context["None"]), 1)
+
+ # Check whether each task has the right master_target, num_workers, is_chief
+ # and distributed_mode.
+ self.assertEqual(self._task_context["None"][0], ("local", 0, True, False))
+
+ def testBetweenGraphContextWithChief(self):
+ # Adds a chief node, so there are NUM_WORKERS + 1 workers in total.
+ cluster_spec = copy.deepcopy(self._cluster_spec)
+ cluster_spec[CHIEF] = ["fake_chief"]
+
+ # Dumps the task contexts to the self._task_context dict.
+ distribute_coordinator.run_distribute_coordinator(
+ self._dump_task_context,
+ cluster_spec=cluster_spec,
+ between_graph=True,
+ rpc_layer="grpc")
+
+ # There is one CHIEF task and there are three WORKER tasks.
+ self.assertEqual(len(self._task_context), 2)
+ self.assertTrue(CHIEF in self._task_context)
+ self.assertTrue(WORKER in self._task_context)
+ self.assertEqual(len(self._task_context[CHIEF]), 1)
+ self.assertEqual(len(self._task_context[WORKER]), NUM_WORKERS)
+
+ # Check whether each task has the right master_target, num_workers, is_chief
+ # and distributed_mode.
+ self.assertEqual(self._task_context[CHIEF][0],
+ ("grpc://fake_chief", 4, True, True))
+ self.assertEqual(self._task_context[WORKER][0],
+ ("grpc://" + _bytes_to_str(self._workers[0].target),
+ NUM_WORKERS + 1, False, True))
+ self.assertEqual(self._task_context[WORKER][1],
+ ("grpc://" + _bytes_to_str(self._workers[1].target),
+ NUM_WORKERS + 1, False, True))
+ self.assertEqual(self._task_context[WORKER][2],
+ ("grpc://" + _bytes_to_str(self._workers[2].target),
+ NUM_WORKERS + 1, False, True))
+
+ def testInGraphContextWithEval(self):
+ # Adds an EVALUATOR job.
+ cluster_spec = copy.deepcopy(self._cluster_spec)
+ cluster_spec[EVALUATOR] = ["fake_evaluator"]
+
+ # Dumps the task contexts to the self._task_context dict.
+ distribute_coordinator.run_distribute_coordinator(
+ self._dump_task_context, cluster_spec=cluster_spec, between_graph=False)
+
+ # There are one "None" task and one EVALUATOR task.
+ self.assertEqual(len(self._task_context), 2)
+ self.assertTrue("None" in self._task_context)
+ self.assertTrue(EVALUATOR in self._task_context)
+ self.assertEqual(len(self._task_context["None"]), 1)
+ self.assertEqual(len(self._task_context[EVALUATOR]), 1)
+
+ # Check whether each task has the right master_target, num_workers, is_chief
+ # and distributed_mode.
+ self.assertEqual(self._task_context["None"][0],
+ (_bytes_to_str(self._workers[0].target), 3, True, True))
+ self.assertEqual(self._task_context[EVALUATOR][0],
+ ("fake_evaluator", 3, False, True))
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 6ede8e4f4d..32a8452f62 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -322,6 +322,7 @@ cuda_py_test(
"//tensorflow/python:math_ops",
"//tensorflow/python:pywrap_tensorflow",
"//tensorflow/python:random_ops",
+ "//tensorflow/python/keras",
],
)
@@ -404,6 +405,7 @@ cuda_py_test(
"//tensorflow/python:array_ops",
"//tensorflow/python:client_testlib",
"//tensorflow/python:framework_test_lib",
+ "@six_archive//:six",
],
tags = [
"optonly", # The test is too slow in non-opt mode
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 9e0bbce4a1..c59ad09bf1 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -599,15 +599,18 @@ def _fast_fill(value, shape, dtype):
def _zeros(shape, dtype):
- """Wraps array_ops.zeros to cache last zero for a given shape and dtype."""
- device = context.context().device_name
+ """Helper to return (possibly cached) zero tensors in eager mode."""
if dtype == dtypes.variant:
# TODO(apassos): need to save enough information about variant tensors to do
# a zeros
return None
- # pylint: disable=protected-access
- cache_key = shape, dtype, device, context.context()._eager_context.mode
- # pylint: enable=protected-access
+
+ ctx = context.context()
+ if not ctx.executing_eagerly():
+ return array_ops.zeros(shape, dtype)
+
+ device = ctx.device_name
+ cache_key = shape, dtype, device
cached = _zeros_cache.get(cache_key)
if cached is None:
cached = _fast_fill(0, shape, dtype)
@@ -616,6 +619,9 @@ def _zeros(shape, dtype):
def _ones(shape, dtype):
+ if not context.context().executing_eagerly():
+ return array_ops.ones(shape, dtype)
+
if shape == (): # pylint: disable=g-explicit-bool-comparison
return constant_op.constant(1, dtype=dtype)
return _fast_fill(1, shape, dtype)
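For reference, a minimal runnable sketch of the caching scheme this hunk implements, with numpy standing in for the eager fill (an assumption made for brevity): zero tensors are cached per (shape, dtype, device), and because the cache is consulted only in eager mode, graph tensors no longer leak across graphs or modes:
```
import numpy as np

_zeros_cache = {}

def cached_zeros(shape, dtype, device):
  # Key mirrors the cache_key used by _zeros above.
  key = (shape, dtype, device)
  cached = _zeros_cache.get(key)
  if cached is None:
    cached = np.zeros(shape, dtype=dtype)  # stand-in for _fast_fill(0, ...)
    _zeros_cache[key] = cached
  return cached
```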
@@ -643,10 +649,10 @@ class GradientTape(object):
Operations are recorded if they are executed within this context manager and
at least one of their inputs is being "watched".
- Trainable variables (created by `tf.contrib.eager.Variable` or
- @{tf.get_variable}, trainable=True is default in both cases) are automatically
- watched. Tensors can be manually watched by invoking the `watch` method on
- this context manager.
+ Trainable variables (created by `tf.Variable` or @{tf.get_variable},
+ trainable=True is default in both cases) are automatically watched. Tensors
+ can be manually watched by invoking the `watch` method on this context
+ manager.
For example, consider the function `y = x * x`. The gradient at `x = 3.0` can
be computed as:
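The hunk ends mid-docstring; the example it introduces computes that gradient roughly as follows (a sketch against the public eager API of this era):
```
import tensorflow as tf

tf.enable_eager_execution()  # pre-TF-2.x API
x = tf.constant(3.0)
with tf.GradientTape() as g:
  g.watch(x)
  y = x * x
dy_dx = g.gradient(y, x)  # 6.0
```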
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index bdda200ff6..3d3f54b9c4 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -96,6 +96,19 @@ class BackpropTest(test.TestCase):
self.assertAllEqual(grads_and_vars[0][0], 1.0)
self.assertAllEqual(id(grads_and_vars[0][1]), id(x))
+ def testGradientInsideLoop(self):
+ with ops.Graph().as_default():
+ v = resource_variable_ops.ResourceVariable(1.0)
+
+ def body(_):
+ _ = v + 1.0 # This reads the variable inside the loop context
+ with backprop.GradientTape() as t:
+ result = v * 2
+ self.assertTrue(t.gradient(result, v) is not None)
+ return 1.0
+
+ control_flow_ops.while_loop(lambda i: False, body, [1.0])
+
def testWhereGradient(self):
# Note: where is special because only some of its arguments are of
# differentiable dtypes.
@@ -912,32 +925,23 @@ class BackpropTest(test.TestCase):
'did you forget to return a value from fn?'):
val_and_grads_fn(x, y)
- def testZerosCacheDoesntLeakAcrossModes(self):
- with ops.Graph().as_default():
- t = random_ops.random_normal(shape=[100, 2])
- x = random_ops.random_normal(shape=[100, 4])
- dy = random_ops.random_normal(shape=[100, 4])
- with backprop.GradientTape() as gradient_tape:
- gradient_tape.watch(x)
- x1, _ = array_ops.split(x, num_or_size_splits=2, axis=1)
- y1 = x1 ** 2.
- y = array_ops.concat([y1, t], axis=1)
-
- dx = gradient_tape.gradient(y, x, output_gradients=dy)
- with self.test_session() as sess:
- sess.run(variables.global_variables_initializer())
- sess.run(dx)
-
- t = random_ops.random_normal(shape=[100, 2])
- x = random_ops.random_normal(shape=[100, 4])
- dy = random_ops.random_normal(shape=[100, 4])
- with backprop.GradientTape() as gradient_tape:
- gradient_tape.watch(x)
- x1, _ = array_ops.split(x, num_or_size_splits=2, axis=1)
- y1 = x1 ** 2.
- y = array_ops.concat([y1, t], axis=1)
-
- dx = gradient_tape.gradient(y, x, output_gradients=dy)
+ def testZerosCacheDoesntLeakAcrossGraphs(self):
+ with context.graph_mode():
+ def get_grad():
+ with ops.Graph().as_default(), self.test_session():
+ t = constant_op.constant(1, dtype=dtypes.float32, shape=(10, 4))
+ x = constant_op.constant(2, dtype=dtypes.float32, shape=(10, 4))
+ with backprop.GradientTape() as gt:
+ gt.watch(x)
+ x1, _ = array_ops.split(x, num_or_size_splits=2, axis=1)
+ y1 = x1**2
+ y = array_ops.concat([y1, t], axis=1)
+ return self.evaluate(gt.gradient(y, x))
+
+ grad1 = get_grad()
+ grad2 = get_grad()
+
+ self.assertAllEqual(grad1, grad2)
if __name__ == '__main__':
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 3aad4a114a..afc4bf0066 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -31,6 +31,7 @@ import numpy as np
import six
from six.moves import xrange # pylint: disable=redefined-builtin
+from tensorflow.python import keras
from tensorflow.python import pywrap_tensorflow
from tensorflow.python.eager import backprop # pylint: disable=unused-import
from tensorflow.python.eager import context
@@ -70,6 +71,25 @@ def c_tfe_py_fastpath_execute(a,
six.raise_from(core._status_to_exception(e.code, message), None)
+class SubclassedKerasModel(keras.Model):
+
+ def __init__(self):
+ super(SubclassedKerasModel, self).__init__()
+ self.layer = keras.layers.Dense(
+ 10, kernel_initializer="ones", bias_initializer="zeros")
+
+ def call(self, x):
+ return self.layer(x)
+
+
+def make_keras_model():
+ x = keras.Input(shape=(10,))
+ y = keras.layers.Dense(
+ 10, kernel_initializer="ones", bias_initializer="zeros")(
+ x)
+ return keras.Model(inputs=x, outputs=y)
+
+
class MicroBenchmarks(test.Benchmark):
def __init__(self):
@@ -115,6 +135,7 @@ class MicroBenchmarks(test.Benchmark):
def func():
ops.EagerTensor(value, context=handle, device=device, dtype=dtype)
+
self._run(func, 30000)
def benchmark_create_float_tensor_from_list_CPU(self):
@@ -211,8 +232,8 @@ class MicroBenchmarks(test.Benchmark):
inputs = [m]
def f():
- pywrap_tensorflow.TFE_Py_Execute(
- ctx_handle, None, "Identity", inputs, attrs, 1)
+ pywrap_tensorflow.TFE_Py_Execute(ctx_handle, None, "Identity", inputs,
+ attrs, 1)
self._run(f, 30000)
@@ -234,14 +255,13 @@ class MicroBenchmarks(test.Benchmark):
def f():
with backprop.GradientTape():
pass
+
self._run(f, 30000)
def benchmark_tf_gradient_function_no_op(self):
with context.device(CPU):
m = gen_array_ops.identity(self._m_2)
- self._run(
- lambda: backprop.gradients_function(lambda x: x, [0])(m),
- 30000)
+ self._run(lambda: backprop.gradients_function(lambda x: x, [0])(m), 30000)
def _benchmark_np_matmul(self, m, transpose_b, num_iters):
a = m.cpu().numpy()
@@ -255,6 +275,7 @@ class MicroBenchmarks(test.Benchmark):
self._run(func, num_iters, execution_mode=execution_mode)
def _benchmark_gen_math_ops_matmul(self, m, transpose_b, num_iters):
+
def func():
gen_math_ops.mat_mul(m, m, transpose_b=transpose_b)
@@ -276,9 +297,10 @@ class MicroBenchmarks(test.Benchmark):
device = context.context().device_name
attrs = ("transpose_a", False, "transpose_b", transpose_b, "T",
m.dtype.as_datatype_enum)
+
def func():
- pywrap_tensorflow.TFE_Py_Execute(ctx_handle, device, "MatMul",
- inputs, attrs, 1)
+ pywrap_tensorflow.TFE_Py_Execute(ctx_handle, device, "MatMul", inputs,
+ attrs, 1)
self._run(func, num_iters)
@@ -542,6 +564,30 @@ class MicroBenchmarks(test.Benchmark):
self._benchmark_read_variable_with_tape(
m, num_iters=self._num_iters_2_by_2)
+ def benchmark_keras_model_subclassed(self):
+ model = SubclassedKerasModel()
+ data = random_ops.random_uniform((10, 10))
+
+ func = lambda: model(data)
+ # First call is more expensive (creates variables etc.), discount that.
+ func()
+
+ # The whole point of this test is to contrast subclassing with
+ # the functional style of keras model building, so validate that
+ # the models are equivalent.
+ assert np.equal(func(), make_keras_model()(data)).all()
+
+ self._run(func, 30000)
+
+ def benchmark_keras_model_functional(self):
+ model = make_keras_model()
+ data = random_ops.random_uniform((10, 10))
+ func = lambda: model(data)
+ # Symmetry with benchmark_keras_model_subclassed
+ func()
+ assert np.equal(func(), SubclassedKerasModel()(data)).all()
+ self._run(func, 30000)
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 85b9491903..495a674526 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -177,6 +177,11 @@ class Context(object):
- tf.contrib.eager.SYNC: executes each operation synchronously.
- tf.contrib.eager.ASYNC: executes each operation asynchronously. These
operations may return "non-ready" handles.
+ server_def: (Optional.) A tensorflow::ServerDef proto.
+ Enables execution on remote devices. GrpcServers need to be started by
+ creating a server_def identical to this one, with the appropriate
+ task_index set on each server, so that the servers can communicate. It
+ will then be possible to execute operations on remote devices.
Raises:
ValueError: If execution_mode is not valid.
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index 3fabe7060e..cc765725a4 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -610,6 +610,14 @@ class TFETest(test_util.TensorFlowTestCase):
self.assertEquals(typ, dtypes.float32)
self.assertIsInstance(t, ops.EagerTensor)
+ def testConvertMixedEagerTensorsWithVariables(self):
+ var = resource_variable_ops.ResourceVariable(1.0)
+ types, tensors = execute_lib.convert_to_mixed_eager_tensors(
+ ['foo', var], context.context())
+ self.assertAllEqual([dtypes.string, dtypes.float32], types)
+ for t in tensors:
+ self.assertIsInstance(t, ops.EagerTensor)
+
class SendRecvTest(test_util.TensorFlowTestCase):
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index 2ff5b8d8f4..f9b8d2cb5d 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -198,11 +198,7 @@ def args_to_matching_eager(l, ctx, default_dtype=None):
def convert_to_mixed_eager_tensors(values, ctx):
- v = [
- t if isinstance(t, ops.EagerTensor) else ops.EagerTensor(
- t, context=ctx._handle, device=ctx.device_name) # pylint: disable=protected-access
- for t in values
- ]
+ v = [ops.internal_convert_to_tensor(t, ctx=ctx) for t in values]
types = [t._datatype_enum() for t in v] # pylint: disable=protected-access
return types, v
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index a6906f9efd..5e4f9e29da 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -470,37 +470,39 @@ class GraphModeFunction(object):
def _construct_backprop_function(self):
"""Constructs the backprop function object for this function."""
- with self._graph.as_default():
- c_known_ops = set()
- c_captured_tensors = set()
-
- existing_op_len = len(self._graph.get_operations())
- filtered_outputs = [x for x in self._python_returns if x is not None]
+ filtered_outputs = [x for x in self._python_returns if x is not None]
+ captures = {}
+ backwards_graph = CapturingGraph(captures)
+ backwards_graph._graph_key = self._graph._graph_key # pylint: disable=protected-access
+ for collection in self._graph.collections:
+ backwards_graph.get_collection_ref(
+ collection)[:] = self._graph.get_collection(collection)
+ backwards_graph.seed = self._graph.seed
+ with backwards_graph.as_default():
self._out_grad_placeholders = [
graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
- in_gradients = gradients_impl.gradients(
+ in_gradients = gradients_impl._GradientsHelper( # pylint: disable=protected-access
filtered_outputs,
self._input_placeholders,
- grad_ys=self._out_grad_placeholders)
- for op in self._graph.get_operations()[existing_op_len:]:
- if op.type in ["Variable", "VariableV2", "VarHandleOp"]:
- raise ValueError("defun cannot capture variables created without "
- "using tf.get_variable. Op: %s" % op)
- c_known_ops.add(op)
- for i in op.inputs:
- if i.op not in c_known_ops:
- c_captured_tensors.add(i)
+ grad_ys=self._out_grad_placeholders,
+ src_graph=self._graph)
backward_outputs = tuple(
grad for grad in _flatten(in_gradients) if grad is not None)
output_shapes = tuple(grad.shape for grad in backward_outputs)
- captures = list(sorted(c_captured_tensors, key=lambda x: x.name))
+ ids = list(sorted(captures.keys()))
+ if ids:
+ extra_inputs, extra_placeholders = zip(*[captures[x] for x in ids])
+ else:
+ extra_inputs = []
+ extra_placeholders = []
+
forward_name = _forward_name(self._func_name)
self._forward_fdef = _EagerDefinedFunction(
forward_name, self._graph, self._ops, self._input_placeholders,
- filtered_outputs + captures, self._attrs)
- all_inputs = self._out_grad_placeholders + captures
+ filtered_outputs + list(extra_inputs), self._attrs)
+ all_inputs = self._out_grad_placeholders + list(extra_placeholders)
# Excluding input ops from the body as we do not intend to execute these
# operations when the function is executed.
all_ignored_ops = frozenset(x.op for x in all_inputs)
@@ -508,11 +510,12 @@ class GraphModeFunction(object):
# means rerunning the function-defining code will always define the same
# function, which is useful if we serialize this etc.
function_def_ops = tuple(x
- for x in sorted(c_known_ops, key=lambda x: x.name)
+ for x in sorted(backwards_graph.get_operations(),
+ key=lambda x: x.name)
if x not in all_ignored_ops)
bname = _backward_name(self._func_name)
self._backward_function = GraphModeFunction(
- bname, all_inputs, [], self._graph, function_def_ops,
+ bname, all_inputs, [], backwards_graph, function_def_ops,
backward_outputs, in_gradients, output_shapes, attrs=self._attrs)
def _backprop_call(self, args):
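A hedged sketch of the capture bookkeeping the rewritten backprop construction relies on (plain Python; strings stand in for tensors): `CapturingGraph` fills the dict with (external tensor, internal placeholder) pairs keyed by the external tensor's id, and sorting the keys recovers two aligned lists:
```
# Sketch only: in the real code the dict is populated by CapturingGraph
# while the gradients are built inside the backwards graph.
captures = {
    101: ("forward_output_0", "grad_placeholder_0"),
    57: ("forward_output_1", "grad_placeholder_1"),
}
ids = list(sorted(captures.keys()))
if ids:
  extra_inputs, extra_placeholders = zip(*[captures[x] for x in ids])
else:
  extra_inputs, extra_placeholders = [], []
# extra_inputs are appended to the forward function's outputs; the
# placeholders become extra inputs of the backward function.
```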
@@ -1077,7 +1080,7 @@ def defun(func=None, compiled=False):
tf.enable_eager_execution()
def fn():
- x = tf.contrib.eager.Variable(0.0)
+ x = tf.Variable(0.0)
x.assign_add(1.0)
return x.read_value()
@@ -1094,19 +1097,18 @@ def defun(func=None, compiled=False):
```
Finally, because each input signature is bound to a unique graph, if your
- Python function constructs `tf.contrib.eager.Variable` objects, then each
- graph constructed for that Python function will reference a unique set of
- variables. To circumvent this problem, we recommend against compiling Python
- functions that create `tf.contrib.eager.Variable` objects. Instead, Python
- functions should either lexically close over `tf.contrib.eager.Variable`
- objects or accept them as arguments, preferably encapsulated in an
- object-oriented container. If you must create variables inside your Python
- function and you want each graph generated for it to reference the same set of
- variables, add logic to your Python function that ensures that variables are
- only created the first time it is called and are reused for every subsequent
- invocation; note that this is precisely what @{tf.keras.layers.Layer} objects
- do, so we recommend using them to represent variable-bearing computations
- whenever possible.
+ Python function constructs `tf.Variable` objects, then each graph constructed
+ for that Python function will reference a unique set of variables. To
+ circumvent this problem, we recommend against compiling Python functions that
+ create `tf.Variable` objects. Instead, Python functions should either
+ lexically close over `tf.Variable` objects or accept them as arguments,
+ preferably encapsulated in an object-oriented container. If you must create
+ variables inside your Python function and you want each graph generated for it
+ to reference the same set of variables, add logic to your Python function that
+ ensures that variables are only created the first time it is called and are
+ reused for every subsequent invocation; note that this is precisely what
+ @{tf.keras.layers.Layer} objects do, so we recommend using them to represent
+ variable-bearing computations whenever possible.
Args:
func: function to be compiled. If `func` is None, returns a
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index e6592b2e37..2e86563a7d 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -213,6 +213,19 @@ class FunctionTest(test.TestCase):
self.assertEqual(fn_op.output_shapes, None)
self.assertAllEqual(fn_op(x, x), None)
+ @test_util.run_in_graph_and_eager_modes()
+ def testDefunCondGradient(self):
+
+ @function.defun
+ def f(x):
+ return control_flow_ops.cond(x > 0.5, lambda: 2 * x, lambda: 3 * x)
+
+ with backprop.GradientTape() as t:
+ x = constant_op.constant(1.0)
+ t.watch(x)
+ y = f(x)
+ self.assertAllEqual(self.evaluate(t.gradient(y, x)), 2.0)
+
def testDefunCapturedInt32(self):
x = constant_op.constant(1, dtype=dtypes.int32)
diff --git a/tensorflow/python/eager/memory_test.py b/tensorflow/python/eager/memory_test.py
index 74c6cbdd31..a1a59d511f 100644
--- a/tensorflow/python/eager/memory_test.py
+++ b/tensorflow/python/eager/memory_test.py
@@ -24,6 +24,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import six
+
from tensorflow.python import keras
from tensorflow.python.eager import backprop
from tensorflow.python.eager import context
@@ -63,7 +65,7 @@ class MemoryTest(test.TestCase):
initial = memory_profiler.memory_usage(-1)[0]
- for _ in xrange(num_iters):
+ for _ in six.moves.range(num_iters):
f()
increase = memory_profiler.memory_usage(-1)[0] - initial
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index fc76ede4c5..17a090d526 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -370,6 +370,10 @@ class OpsTest(test_util.TensorFlowTestCase):
with self.assertRaises(TypeError):
float(x)
+ def testRange(self):
+ x = constant_op.constant(2)
+ self.assertEqual([0, 1], list(range(x)))
+
def testFormatString(self):
x = constant_op.constant(3.1415)
self.assertEqual('3.14', '{:.2f}'.format(x))
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index cefd5b1206..15d2ccf9d2 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -154,6 +154,7 @@ TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
TFE_OpSetAttrType(op, "SrcT", src_type_enum);
TFE_OpSetAttrType(op, "DstT", dst_type_enum);
+ TFE_OpSetAttrBool(op, "Truncate", false);
TFE_TensorHandle* output = nullptr;
int num_outputs = 1;
TFE_Execute(op, &output, &num_outputs, out_status);
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 4d28e98961..0eabea321c 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -845,11 +845,9 @@ int64_t get_uid() {
PyObject* TFE_Py_UID() { return PyLong_FromLongLong(get_uid()); }
void TFE_DeleteContextCapsule(PyObject* context) {
- TF_Status* status = TF_NewStatus();
TFE_Context* ctx =
reinterpret_cast<TFE_Context*>(PyCapsule_GetPointer(context, nullptr));
- TFE_DeleteContext(ctx, status);
- TF_DeleteStatus(status);
+ TFE_DeleteContext(ctx);
}
static tensorflow::int64 MakeInt(PyObject* integer) {
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index fd46163050..817c8e6848 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -171,6 +171,7 @@ py_test(
name = "baseline_test",
size = "medium",
srcs = ["canned/baseline_test.py"],
+ shard_count = 4,
srcs_version = "PY2AND3",
tags = [
"no_pip",
@@ -207,6 +208,7 @@ py_test(
name = "boosted_trees_test",
size = "medium",
srcs = ["canned/boosted_trees_test.py"],
+ shard_count = 2,
srcs_version = "PY2AND3",
tags = [
"optonly",
@@ -676,6 +678,7 @@ py_test(
name = "keras_test",
size = "large",
srcs = ["keras_test.py"],
+ shard_count = 4,
srcs_version = "PY2AND3",
tags = [
"no_windows",
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 3292e2724d..8b423f76de 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -46,7 +46,7 @@ from tensorflow.python.util.tf_export import estimator_export
# TODO(nponomareva): Reveal pruning params here.
_TreeHParams = collections.namedtuple('TreeHParams', [
'n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity',
- 'min_node_weight', 'center_bias'
+ 'min_node_weight', 'center_bias', 'pruning_mode'
])
_HOLD_FOR_MULTI_CLASS_SUPPORT = object()
@@ -410,9 +410,20 @@ class _EnsembleGrower(object):
Args:
tree_ensemble: A TreeEnsemble variable.
tree_hparams: TODO. collections.namedtuple for hyper parameters.
+ Raises:
+ ValueError: when pruning mode is invalid or pruning is used and no tree
+ complexity is set.
"""
self._tree_ensemble = tree_ensemble
self._tree_hparams = tree_hparams
+ # pylint: disable=protected-access
+ self._pruning_mode_parsed = boosted_trees_ops.PruningMode.from_str(
+ tree_hparams.pruning_mode)
+
+ if (self._pruning_mode_parsed != boosted_trees_ops.PruningMode.NO_PRUNING
+ and tree_hparams.tree_complexity <= 0):
+ raise ValueError('For pruning, tree_complexity must be positive.')
+ # pylint: enable=protected-access
@abc.abstractmethod
def center_bias(self, center_bias_var, gradients, hessians):
@@ -500,7 +511,7 @@ class _EnsembleGrower(object):
right_node_contribs=right_node_contribs_list,
learning_rate=self._tree_hparams.learning_rate,
max_depth=self._tree_hparams.max_depth,
- pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
+ pruning_mode=self._pruning_mode_parsed)
return grow_op
@@ -675,6 +686,7 @@ def _bt_model_fn(
is_single_machine = (config.num_worker_replicas <= 1)
sorted_feature_columns = sorted(feature_columns, key=lambda tc: tc.name)
center_bias = tree_hparams.center_bias
+
if train_in_memory:
assert n_batches_per_layer == 1, (
'When train_in_memory is enabled, input_fn should return the entire '
@@ -925,7 +937,8 @@ class BoostedTreesClassifier(estimator.Estimator):
tree_complexity=0.,
min_node_weight=0.,
config=None,
- center_bias=False):
+ center_bias=False,
+ pruning_mode='none'):
"""Initializes a `BoostedTreesClassifier` instance.
Example:
@@ -999,7 +1012,11 @@ class BoostedTreesClassifier(estimator.Estimator):
regression problems, the first node will return the mean of the labels.
For binary classification problems, it will return a logit for a prior
probability of label 1.
-
+ pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning,
+ pre-pruning (do not split a node if not enough gain is observed) or
+ post-pruning (build the tree up to a max depth and then prune branches
+ with negative gain). For pre- and post-pruning, you MUST provide
+ tree_complexity > 0.
Raises:
ValueError: when wrong arguments are given or unsupported functionalities
@@ -1012,9 +1029,9 @@ class BoostedTreesClassifier(estimator.Estimator):
n_classes, weight_column, label_vocabulary=label_vocabulary)
# HParams for the model.
- tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
- l1_regularization, l2_regularization,
- tree_complexity, min_node_weight, center_bias)
+ tree_hparams = _TreeHParams(
+ n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+ tree_complexity, min_node_weight, center_bias, pruning_mode)
def _model_fn(features, labels, mode, config):
return _bt_model_fn( # pylint: disable=protected-access
@@ -1058,7 +1075,8 @@ class BoostedTreesRegressor(estimator.Estimator):
tree_complexity=0.,
min_node_weight=0.,
config=None,
- center_bias=False):
+ center_bias=False,
+ pruning_mode='none'):
"""Initializes a `BoostedTreesRegressor` instance.
Example:
@@ -1125,6 +1143,11 @@ class BoostedTreesRegressor(estimator.Estimator):
regression problems, the first node will return the mean of the labels.
For binary classification problems, it will return a logit for a prior
probability of label 1.
+ pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning,
+ pre-pruning (do not split a node if not enough gain is observed) or
+ post-pruning (build the tree up to a max depth and then prune branches
+ with negative gain). For pre- and post-pruning, you MUST provide
+ tree_complexity > 0.
Raises:
ValueError: when wrong arguments are given or unsupported functionalities
@@ -1136,9 +1159,9 @@ class BoostedTreesRegressor(estimator.Estimator):
head = _create_regression_head(label_dimension, weight_column)
# HParams for the model.
- tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
- l1_regularization, l2_regularization,
- tree_complexity, min_node_weight, center_bias)
+ tree_hparams = _TreeHParams(
+ n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+ tree_complexity, min_node_weight, center_bias, pruning_mode)
def _model_fn(features, labels, mode, config):
return _bt_model_fn( # pylint: disable=protected-access
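Since pre- and post-pruning now require a positive `tree_complexity`, a hedged usage sketch of the new parameter (the feature column and counts are placeholders):
```
import tensorflow as tf

feature_columns = [tf.feature_column.numeric_column("f0")]
classifier = tf.estimator.BoostedTreesClassifier(
    feature_columns=feature_columns,
    n_batches_per_layer=1,
    n_trees=10,
    max_depth=4,
    pruning_mode="post",   # one of 'none', 'pre', 'post'
    tree_complexity=0.05)  # must be > 0 when pruning is enabled
```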
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index f807641057..ec597e4686 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -1508,7 +1508,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
l2=0.01,
tree_complexity=0.,
min_node_weight=0.,
- center_bias=center_bias)
+ center_bias=center_bias,
+ pruning_mode='none')
estimator_spec = boosted_trees._bt_model_fn( # pylint:disable=protected-access
features=features,
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 2fd6f6fab9..cc5a61b54e 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -567,14 +567,19 @@ class Estimator(object):
def _assert_members_are_not_overridden(self):
"""Asserts members of `Estimator` are not overridden."""
+ # TPUEstimator is special cased (owned by TF).
+ if self.__class__.__name__ == 'TPUEstimator':
+ return
+
allowed_overrides = set([
- '_call_input_fn', '_create_global_step',
+ '_call_input_fn', '_call_model_fn',
'_convert_train_steps_to_hooks', '_convert_eval_steps_to_hooks',
+ '_create_global_step', '_create_and_assert_global_step',
'_tf_api_names', '_tf_api_names_v1', '_estimator_api_names',
'_estimator_api_names_v1', '_estimator_api_constants',
'_estimator_api_constants_v1',
'_validate_features_in_predict_input',
- '_call_model_fn', '_add_meta_graph_for_mode'
+ '_add_meta_graph_for_mode'
])
estimator_members = set([m for m in Estimator.__dict__.keys()
if not m.startswith('__')])
@@ -901,9 +906,10 @@ class Estimator(object):
with tf_session.Session(config=self._session_config) as session:
- local_init_op = (
- estimator_spec.scaffold.local_init_op or
- monitored_session.Scaffold.default_local_init_op())
+ if estimator_spec.scaffold.local_init_op is not None:
+ local_init_op = estimator_spec.scaffold.local_init_op
+ else:
+ local_init_op = monitored_session.Scaffold.default_local_init_op()
# This saver will be used both for restoring variables now,
# and in saving out the metagraph below. This ensures that any
@@ -1155,13 +1161,19 @@ class Estimator(object):
with ops.Graph().as_default() as g, g.device(self._device_fn):
random_seed.set_random_seed(self._config.tf_random_seed)
global_step_tensor = self._create_and_assert_global_step(g)
- training_util._get_or_create_global_step_read() # pylint: disable=protected-access
+
+ # Skip creating a read variable if _create_and_assert_global_step
+ # returns None (e.g. tf.contrib.estimator.SavedModelEstimator).
+ if global_step_tensor is not None:
+ training_util._get_or_create_global_step_read(g) # pylint: disable=protected-access
+
features, labels, input_hooks = (
self._get_features_and_labels_from_input_fn(
input_fn, model_fn_lib.ModeKeys.TRAIN))
worker_hooks.extend(input_hooks)
estimator_spec = self._call_model_fn(
features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
+ global_step_tensor = training_util.get_global_step(g)
return self._train_with_estimator_spec(estimator_spec, worker_hooks,
hooks, global_step_tensor,
saving_listeners)
@@ -1448,13 +1460,13 @@ class Estimator(object):
def _evaluate_build_graph(self, input_fn, hooks=None, checkpoint_path=None):
"""Builds the graph and related hooks to run evaluation."""
random_seed.set_random_seed(self._config.tf_random_seed)
- global_step_tensor = self._create_and_assert_global_step(
- ops.get_default_graph())
+ self._create_and_assert_global_step(ops.get_default_graph())
features, labels, input_hooks = (
self._get_features_and_labels_from_input_fn(input_fn,
model_fn_lib.ModeKeys.EVAL))
estimator_spec = self._call_model_fn(
features, labels, model_fn_lib.ModeKeys.EVAL, self.config)
+ global_step_tensor = training_util.get_global_step(ops.get_default_graph())
# Call to warm_start has to be after model_fn is called.
self._maybe_warm_start(checkpoint_path)
@@ -1480,7 +1492,21 @@ class Estimator(object):
all_hooks.extend(hooks)
all_hooks.extend(list(estimator_spec.evaluation_hooks or []))
- return estimator_spec.scaffold, update_op, eval_dict, all_hooks
+ # New local variables have been added, so update the estimator spec's
+ # local init op if it was defined.
+ scaffold = estimator_spec.scaffold
+ if estimator_spec.scaffold and estimator_spec.scaffold.local_init_op:
+ # Ensure that eval step has been created before updating local init op.
+ evaluation._get_or_create_eval_step() # pylint: disable=protected-access
+
+ scaffold = monitored_session.Scaffold(
+ local_init_op=control_flow_ops.group(
+ estimator_spec.scaffold.local_init_op,
+ monitored_session.Scaffold.default_local_init_op()),
+ copy_from_scaffold=scaffold
+ )
+
+ return scaffold, update_op, eval_dict, all_hooks
def _evaluate_run(self, checkpoint_path, scaffold, update_op, eval_dict,
all_hooks, output_dir):
@@ -1911,6 +1937,19 @@ class WarmStartSettings(
)
+def _get_saved_model_ckpt(saved_model_dir):
+ """Return path to variables checkpoint in a SavedModel directory."""
+ if not gfile.Exists(
+ os.path.join(compat.as_bytes(saved_model_dir),
+ compat.as_bytes('variables/variables.index'))):
+ raise ValueError('Directory provided has an invalid SavedModel format: %s'
+ % saved_model_dir)
+ return os.path.join(
+ compat.as_bytes(saved_model_dir),
+ compat.as_bytes('{}/{}'.format(constants.VARIABLES_DIRECTORY,
+ constants.VARIABLES_FILENAME)))
+
+
def _get_default_warm_start_settings(warm_start_from):
"""Returns default WarmStartSettings.
@@ -1934,10 +1973,8 @@ def _get_default_warm_start_settings(warm_start_from):
if gfile.Exists(os.path.join(compat.as_bytes(warm_start_from),
compat.as_bytes('variables/variables.index'))):
logging.info('Warm-starting from a SavedModel')
- return WarmStartSettings(ckpt_to_initialize_from=os.path.join(
- compat.as_bytes(warm_start_from),
- compat.as_bytes('{}/{}'.format(constants.VARIABLES_DIRECTORY,
- constants.VARIABLES_FILENAME))))
+ return WarmStartSettings(
+ ckpt_to_initialize_from=_get_saved_model_ckpt(warm_start_from))
return WarmStartSettings(ckpt_to_initialize_from=warm_start_from)
elif isinstance(warm_start_from, WarmStartSettings):
return warm_start_from
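The local-init merge added to `_evaluate_build_graph` above follows a standard Scaffold pattern; a minimal sketch of just the combination step, using the modules this file already imports:
```
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.training import monitored_session

def merge_local_init(scaffold):
  # Group the user-supplied local_init_op with the default one so that
  # local variables created after model_fn ran still get initialized.
  if scaffold is not None and scaffold.local_init_op is not None:
    return monitored_session.Scaffold(
        local_init_op=control_flow_ops.group(
            scaffold.local_init_op,
            monitored_session.Scaffold.default_local_init_op()),
        copy_from_scaffold=scaffold)
  return scaffold
```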
diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py
index 076359b503..70517ae278 100644
--- a/tensorflow/python/estimator/keras.py
+++ b/tensorflow/python/estimator/keras.py
@@ -21,11 +21,14 @@ from __future__ import print_function
import os
import re
+import tempfile
+
from tensorflow.python.client import session
from tensorflow.python.estimator import estimator as estimator_lib
from tensorflow.python.estimator import export as export_lib
from tensorflow.python.estimator import model_fn as model_fn_lib
from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.estimator.run_config import RunConfig
from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -39,6 +42,7 @@ from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import metrics as metrics_module
+from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.saved_model import signature_constants
from tensorflow.python.training import distribute as distribute_lib
@@ -180,7 +184,7 @@ def _in_place_subclassed_model_reset(model):
# Replace layers on the model with fresh layers
layers_to_names = {value: key for key, value in attributes_cache.items()}
original_layers = model._layers[:]
- model._layers = []
+ model._layers = data_structures.NoDependency([])
for layer in original_layers: # We preserve layer order.
config = layer.get_config()
# This will not work for nested subclassed models used as layers.
@@ -228,7 +232,8 @@ def _in_place_subclassed_model_reset(model):
]
for name in attributes_to_cache:
attributes_cache[name] = getattr(model, name)
- model._original_attributes_cache = attributes_cache
+ model._original_attributes_cache = data_structures.NoDependency(
+ attributes_cache)
# Reset built state
model.built = False
model.inputs = None
@@ -426,29 +431,34 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
return model_fn
-def _save_first_checkpoint(keras_model, estimator, custom_objects,
- keras_weights):
+def _save_first_checkpoint(keras_model, custom_objects, config):
"""Save first checkpoint for the keras Estimator.
Args:
keras_model: an instance of compiled keras model.
- estimator: keras estimator.
custom_objects: Dictionary for custom objects.
- keras_weights: A flat list of Numpy arrays for weights of given keras_model.
+ config: Estimator config.
Returns:
- The model_fn for a keras Estimator.
+ The path where the keras model checkpoint is saved.
"""
+ # Save the checkpoint into a subdirectory to allow warm start.
+ keras_model_dir = os.path.join(config.model_dir, 'keras')
# Load weights and save to checkpoint if there is no checkpoint
- latest_path = saver_lib.latest_checkpoint(estimator.model_dir)
+ latest_path = saver_lib.latest_checkpoint(keras_model_dir)
if not latest_path:
+ keras_weights = None
+ if _any_weight_initialized(keras_model):
+ keras_weights = keras_model.get_weights()
+ if not gfile.IsDirectory(keras_model_dir):
+ gfile.MakeDirs(keras_model_dir)
with ops.Graph().as_default():
- random_seed.set_random_seed(estimator.config.tf_random_seed)
+ random_seed.set_random_seed(config.tf_random_seed)
training_util.create_global_step()
model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN, keras_model,
custom_objects)
# save to checkpoint
- with session.Session(config=estimator._session_config) as sess:
+ with session.Session(config=config.session_config) as sess:
if keras_weights:
model.set_weights(keras_weights)
# Make update ops and initialize all variables.
@@ -458,7 +468,46 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
K._initialize_variables(sess)
# pylint: enable=protected-access
saver = saver_lib.Saver()
- saver.save(sess, os.path.join(estimator.model_dir, 'keras_model.ckpt'))
+ latest_path = os.path.join(keras_model_dir, 'keras_model.ckpt')
+ saver.save(sess, latest_path)
+ return latest_path
+
+
+def _maybe_overwrite_model_dir_and_session_config(config, model_dir):
+ """Overwrite estimator config by `model_dir` and `session_config` if needed.
+
+ Args:
+ config: Original estimator config.
+ model_dir: Estimator model checkpoint directory.
+
+ Returns:
+ Overwritten estimator config.
+
+ Raises:
+ ValueError: Model directory inconsistent between `model_dir` and `config`.
+ """
+
+ default_session_config = run_config_lib.get_default_session_config()
+ if isinstance(config, dict):
+ config = RunConfig(**config)
+ elif config is None:
+ config = RunConfig(session_config=default_session_config)
+ if config.session_config is None:
+ config = RunConfig.replace(config, session_config=default_session_config)
+
+ if model_dir is not None:
+ if (getattr(config, 'model_dir', None) is not None and
+ config.model_dir != model_dir):
+ raise ValueError(
+ "`model_dir` are set both in constructor and `RunConfig`, but with "
+ "different values. In constructor: '{}', in `RunConfig`: "
+ "'{}' ".format(model_dir, config.model_dir))
+ config = RunConfig.replace(config, model_dir=model_dir)
+ elif getattr(config, 'model_dir', None) is None:
+ model_dir = tempfile.mkdtemp()
+ config = RunConfig.replace(config, model_dir=model_dir)
+
+ return config
def model_to_estimator(keras_model=None,
@@ -517,45 +566,39 @@ def model_to_estimator(keras_model=None,
'Please compile the model with `model.compile()` '
'before calling `model_to_estimator()`.')
- if isinstance(config, dict):
- config = run_config_lib.RunConfig(**config)
+ config = _maybe_overwrite_model_dir_and_session_config(config, model_dir)
keras_model_fn = _create_keras_model_fn(keras_model, custom_objects)
- estimator = estimator_lib.Estimator(
- keras_model_fn, model_dir=model_dir, config=config)
-
- # Check if we need to call get_weights:
if _any_weight_initialized(keras_model):
- keras_weights = keras_model.get_weights()
# Warn if config passed to estimator tries to update GPUOptions. If a
# session has already been created, the GPUOptions passed to the first
# session sticks.
- if estimator._session_config.HasField('gpu_options'):
+ if config.session_config.HasField('gpu_options'):
logging.warning(
'The Keras backend session has already been set. '
'The _session_config passed to model_to_estimator will not be used.')
else:
# Pass the config into keras backend's default session.
- sess = session.Session(config=estimator._session_config)
+ sess = session.Session(config=config.session_config)
K.set_session(sess)
- keras_weights = None
+ warm_start_path = None
if keras_model._is_graph_network:
- # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
- _save_first_checkpoint(keras_model,
- estimator,
- custom_objects,
- keras_weights)
+ warm_start_path = _save_first_checkpoint(keras_model, custom_objects,
+ config)
elif keras_model.built:
- logging.warning('You are creating an Estimator from a Keras model '
- 'manually subclassed from `Model`, that was '
- 'already called on some inputs (and thus already had '
- 'weights). We are currently unable to preserve '
- 'the model\'s state (its weights) '
- 'as part of the estimator '
- 'in this case. Be warned that the estimator '
- 'has been created using '
- 'a freshly initialized version of your model.\n'
- 'Note that this doesn\'t affect the state of the '
- 'model instance you passed as `keras_model` argument.')
+ logging.warning('You are creating an Estimator from a Keras model manually '
+ 'subclassed from `Model`, that was already called on some '
+ 'inputs (and thus already had weights). We are currently '
+ 'unable to preserve the model\'s state (its weights) as '
+ 'part of the estimator in this case. Be warned that the '
+ 'estimator has been created using a freshly initialized '
+ 'version of your model.\n'
+ 'Note that this doesn\'t affect the state of the model '
+ 'instance you passed as `keras_model` argument.')
+
+ estimator = estimator_lib.Estimator(keras_model_fn,
+ config=config,
+ warm_start_from=warm_start_path)
+
return estimator
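A hedged end-to-end sketch of the new warm-start flow (the model and directory are placeholders): for a graph-network Keras model, the first checkpoint now lands under `<model_dir>/keras` and is handed to the Estimator via `warm_start_from` instead of being written directly into `model_dir`:
```
import tensorflow as tf

model = tf.keras.models.Sequential(
    [tf.keras.layers.Dense(2, input_shape=(10,))])
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

est = tf.keras.estimator.model_to_estimator(
    keras_model=model, model_dir="/tmp/keras_est")
# The initial weights are saved to /tmp/keras_est/keras/keras_model.ckpt
# and the returned Estimator warm-starts from that checkpoint.
```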
diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py
index 7a3c5a9bf1..cf4ec7f4da 100644
--- a/tensorflow/python/estimator/keras_test.py
+++ b/tensorflow/python/estimator/keras_test.py
@@ -33,11 +33,13 @@ from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.optimizers import SGD
+from tensorflow.python.ops import variable_scope
from tensorflow.python.ops.parsing_ops import gen_parsing_ops
from tensorflow.python.platform import gfile
from tensorflow.python.platform import test
from tensorflow.python.summary.writer import writer_cache
from tensorflow.python.training import rmsprop
+from tensorflow.python.training import session_run_hook
try:
@@ -50,6 +52,8 @@ _TRAIN_SIZE = 200
_INPUT_SIZE = (10,)
_NUM_CLASS = 2
+_TMP_DIR = '/tmp'
+
def simple_sequential_model():
model = keras.models.Sequential()
@@ -167,6 +171,12 @@ def multi_inputs_multi_outputs_model():
return model
+class MyHook(session_run_hook.SessionRunHook):
+
+ def begin(self):
+ _ = variable_scope.get_variable('temp', [1])
+
+
class TestKerasEstimator(test_util.TensorFlowTestCase):
def setUp(self):
@@ -203,6 +213,54 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
writer_cache.FileWriterCache.clear()
gfile.DeleteRecursively(self._config.model_dir)
+ # see b/109935364
+ @test_util.run_in_graph_and_eager_modes
+ def test_train_with_hooks(self):
+ for model_type in ['sequential', 'functional']:
+ keras_model, (_, _), (
+ _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
+ model_type=model_type, is_evaluate=True)
+ keras_model.compile(
+ loss='categorical_crossentropy',
+ optimizer=rmsprop.RMSPropOptimizer(1e-3),
+ metrics=['mse', keras.metrics.categorical_accuracy])
+
+ my_hook = MyHook()
+ with self.test_session():
+ est_keras = keras_lib.model_to_estimator(
+ keras_model=keras_model, config=self._config)
+ before_eval_results = est_keras.evaluate(
+ input_fn=eval_input_fn, steps=1)
+ est_keras.train(input_fn=train_input_fn, hooks=[my_hook],
+ steps=_TRAIN_SIZE // 16)
+ after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
+ self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
+
+ writer_cache.FileWriterCache.clear()
+ gfile.DeleteRecursively(self._config.model_dir)
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_train_with_model_fit_and_hooks(self):
+ keras_model, (x_train, y_train), _, \
+ train_input_fn, eval_input_fn = get_resource_for_simple_model(
+ model_type='sequential', is_evaluate=True)
+
+ keras_model.compile(
+ loss='categorical_crossentropy',
+ optimizer=rmsprop.RMSPropOptimizer(1e-3),
+ metrics=['mse', keras.metrics.categorical_accuracy])
+ my_hook = MyHook()
+ with self.test_session():
+ keras_model.fit(x_train, y_train, epochs=1)
+
+ keras_est = keras_lib.model_to_estimator(
+ keras_model=keras_model, config=self._config)
+ before_eval_results = keras_est.evaluate(input_fn=eval_input_fn)
+ keras_est.train(input_fn=train_input_fn, hooks=[my_hook],
+ steps=_TRAIN_SIZE // 16)
+ after_eval_results = keras_est.evaluate(input_fn=eval_input_fn, steps=1)
+ self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
+
@test_util.run_in_graph_and_eager_modes
def test_train_with_tf_optimizer(self):
for model_type in ['sequential', 'functional']:
@@ -473,27 +531,43 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
est_keras.train(input_fn=invald_output_name_input_fn, steps=100)
def test_custom_objects(self):
-
+
def relu6(x):
return keras.backend.relu(x, max_value=6)
-
+
keras_model = simple_functional_model(activation=relu6)
keras_model.compile(loss='categorical_crossentropy', optimizer='adam')
custom_objects = {
'relu6': relu6
}
+ (x_train, y_train), _ = testing_utils.get_test_data(
+ train_samples=_TRAIN_SIZE,
+ test_samples=50,
+ input_shape=(10,),
+ num_classes=2)
+ y_train = keras.utils.to_categorical(y_train, 2)
+ input_name = keras_model.input_names[0]
+ output_name = keras_model.output_names[0]
+ train_input_fn = numpy_io.numpy_input_fn(
+ x=randomize_io_type(x_train, input_name),
+ y=randomize_io_type(y_train, output_name),
+ shuffle=False,
+ num_epochs=None,
+ batch_size=16)
with self.assertRaisesRegexp(ValueError, 'relu6'):
with self.test_session():
- keras_lib.model_to_estimator(
+ est = keras_lib.model_to_estimator(
keras_model=keras_model,
model_dir=tempfile.mkdtemp(dir=self._base_dir))
+ est.train(input_fn=train_input_fn, steps=1)
with self.test_session():
- keras_lib.model_to_estimator(
+ est = keras_lib.model_to_estimator(
keras_model=keras_model,
model_dir=tempfile.mkdtemp(dir=self._base_dir),
custom_objects=custom_objects)
+ est.train(input_fn=train_input_fn, steps=1)
def test_tf_config(self):
keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
@@ -530,12 +604,73 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
self._config._session_config = sess_config
- keras_lib.model_to_estimator(
- keras_model=keras_model, config=self._config)
- self.assertEqual(
- keras.backend.get_session()
- ._config.gpu_options.per_process_gpu_memory_fraction,
- gpu_options.per_process_gpu_memory_fraction)
+ with self.test_session():
+ keras_lib.model_to_estimator(
+ keras_model=keras_model, config=self._config)
+ self.assertEqual(
+ keras.backend.get_session()
+ ._config.gpu_options.per_process_gpu_memory_fraction,
+ gpu_options.per_process_gpu_memory_fraction)
+
+ def test_with_empty_config(self):
+ keras_model, _, _, _, _ = get_resource_for_simple_model(
+ model_type='sequential', is_evaluate=True)
+ keras_model.compile(
+ loss='categorical_crossentropy',
+ optimizer='rmsprop',
+ metrics=['mse', keras.metrics.categorical_accuracy])
+
+ with self.test_session():
+ est_keras = keras_lib.model_to_estimator(
+ keras_model=keras_model, model_dir=self._base_dir,
+ config=run_config_lib.RunConfig())
+ self.assertEqual(run_config_lib.get_default_session_config(),
+ est_keras._session_config)
+ self.assertEqual(est_keras._session_config,
+ est_keras._config.session_config)
+ self.assertEqual(self._base_dir, est_keras._config.model_dir)
+ self.assertEqual(self._base_dir, est_keras._model_dir)
+
+ with self.test_session():
+ est_keras = keras_lib.model_to_estimator(
+ keras_model=keras_model, model_dir=self._base_dir,
+ config=None)
+ self.assertEqual(run_config_lib.get_default_session_config(),
+ est_keras._session_config)
+ self.assertEqual(est_keras._session_config,
+ est_keras._config.session_config)
+ self.assertEqual(self._base_dir, est_keras._config.model_dir)
+ self.assertEqual(self._base_dir, est_keras._model_dir)
+
+ def test_with_empty_config_and_empty_model_dir(self):
+ keras_model, _, _, _, _ = get_resource_for_simple_model(
+ model_type='sequential', is_evaluate=True)
+ keras_model.compile(
+ loss='categorical_crossentropy',
+ optimizer='rmsprop',
+ metrics=['mse', keras.metrics.categorical_accuracy])
+
+ with self.test_session():
+ with test.mock.patch.object(tempfile, 'mkdtemp', return_value=_TMP_DIR):
+ est_keras = keras_lib.model_to_estimator(
+ keras_model=keras_model,
+ config=run_config_lib.RunConfig())
+ self.assertEqual(est_keras._model_dir, _TMP_DIR)
+
+ def test_with_conflicting_model_dir_and_config(self):
+ keras_model, _, _, _, _ = get_resource_for_simple_model(
+ model_type='sequential', is_evaluate=True)
+ keras_model.compile(
+ loss='categorical_crossentropy',
+ optimizer='rmsprop',
+ metrics=['mse', keras.metrics.categorical_accuracy])
+
+ with self.test_session():
+ with self.assertRaisesRegexp(ValueError, '`model_dir` are set both in '
+ 'constructor and `RunConfig`'):
+ keras_lib.model_to_estimator(
+ keras_model=keras_model, model_dir=self._base_dir,
+ config=run_config_lib.RunConfig(model_dir=_TMP_DIR))
def test_pretrained_weights(self):
keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index a9fd8f8e1a..9db9ccd01d 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -380,15 +380,12 @@ def _maybe_add_default_serving_output(export_outputs):
return export_outputs
-class _TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
- 'mode',
- 'predictions',
- 'loss',
- 'train_op',
- 'eval_metrics',
- 'export_outputs',
- 'scaffold_fn',
- 'host_call'])):
+class _TPUEstimatorSpec(
+ collections.namedtuple('TPUEstimatorSpec', [
+ 'mode', 'predictions', 'loss', 'train_op', 'eval_metrics',
+ 'export_outputs', 'scaffold_fn', 'host_call', 'training_hooks',
+ 'evaluation_hooks', 'prediction_hooks'
+ ])):
"""Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
This is a simplified implementation of `tf.contrib.tpu.EstimatorSpec`. See
@@ -404,17 +401,24 @@ class _TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
eval_metrics=None,
export_outputs=None,
scaffold_fn=None,
- host_call=None):
+ host_call=None,
+ training_hooks=None,
+ evaluation_hooks=None,
+ prediction_hooks=None):
"""Creates a `_TPUEstimatorSpec` instance."""
- return super(_TPUEstimatorSpec, cls).__new__(cls,
- mode=mode,
- predictions=predictions,
- loss=loss,
- train_op=train_op,
- eval_metrics=eval_metrics,
- export_outputs=export_outputs,
- scaffold_fn=scaffold_fn,
- host_call=host_call)
+ return super(_TPUEstimatorSpec, cls).__new__(
+ cls,
+ mode=mode,
+ predictions=predictions,
+ loss=loss,
+ train_op=train_op,
+ eval_metrics=eval_metrics,
+ export_outputs=export_outputs,
+ scaffold_fn=scaffold_fn,
+ host_call=host_call,
+ training_hooks=training_hooks,
+ evaluation_hooks=evaluation_hooks,
+ prediction_hooks=prediction_hooks)
def as_estimator_spec(self):
"""Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
@@ -423,12 +427,16 @@ class _TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
else:
metric_fn, tensors = self.eval_metrics
eval_metric_ops = metric_fn(**tensors)
- return EstimatorSpec(mode=self.mode,
- predictions=self.predictions,
- loss=self.loss,
- train_op=self.train_op,
- eval_metric_ops=eval_metric_ops,
- export_outputs=self.export_outputs)
+ return EstimatorSpec(
+ mode=self.mode,
+ predictions=self.predictions,
+ loss=self.loss,
+ train_op=self.train_op,
+ eval_metric_ops=eval_metric_ops,
+ export_outputs=self.export_outputs,
+ training_hooks=self.training_hooks,
+ evaluation_hooks=self.evaluation_hooks,
+ prediction_hooks=self.prediction_hooks)
def _check_is_tensor_or_operation(x, name):
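
A hedged sketch of the widened spec; the class is private, so this is illustrative only, using a trivially small loss and train op:

    from tensorflow.python.estimator import model_fn as model_fn_lib
    from tensorflow.python.framework import constant_op
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import control_flow_ops
    from tensorflow.python.training import session_run_hook

    class _NoopHook(session_run_hook.SessionRunHook):
      pass

    with ops.Graph().as_default():
      # The three *_hooks fields are new and default to None.
      tpu_spec = model_fn_lib._TPUEstimatorSpec(  # pylint: disable=protected-access
          mode=model_fn_lib.ModeKeys.TRAIN,
          loss=constant_op.constant(0.0),
          train_op=control_flow_ops.no_op(),
          training_hooks=[_NoopHook()])
      # as_estimator_spec() now forwards the hooks to the CPU EstimatorSpec.
      estimator_spec = tpu_spec.as_estimator_spec()
      assert estimator_spec.training_hooks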
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index a79073b748..7719d03019 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -87,6 +87,53 @@ def _parse_message(message):
return seps, tags
+def _compute_device_summary_from_list(device_assignment_list, prefix=""):
+ """Return a summary of an op's device function stack.
+
+ Args:
+ device_assignment_list: The op._device_assignments list.
+ prefix: An optional string prefix used before each line of the multi-
+ line string returned by this function.
+
+ Returns:
+ A multi-line string similar to:
+ Device assignments active during op creation:
+ with tf.device(/cpu:0): <test_1.py:27>
+ with tf.device(some_func<foo.py, 123>): <test_2.py:38>
+ The first line will have no padding to its left by default. Subsequent
+ lines will have two spaces of left-padding. Use the prefix argument
+ to increase indentation.
+ """
+ if not device_assignment_list:
+ message = "No device assignments were active during op creation."
+ return prefix + message
+
+ str_list = []
+ str_list.append("%sDevice assignments active during op creation:" % prefix)
+
+ for traceable_obj in device_assignment_list:
+ location_summary = "<{file}:{line}>".format(file=traceable_obj.filename,
+ line=traceable_obj.lineno)
+ subs = {
+ "prefix": prefix,
+ "indent": " ",
+ "dev_name": traceable_obj.obj,
+ "loc": location_summary,
+ }
+ str_list.append(
+ "{prefix}{indent}with tf.device({dev_name}): {loc}".format(**subs))
+
+ return "\n".join(str_list)
+
+
+def _compute_device_assignment_summary_from_op(op, prefix=""):
+ if not op:
+ return ""
+ # pylint: disable=protected-access
+ return _compute_device_summary_from_list(op._device_assignments, prefix)
+ # pylint: enable=protected-access
+
+
def _compute_colocation_summary_from_dict(colocation_dict, prefix=""):
"""Return a summary of an op's colocation stack.
@@ -203,6 +250,7 @@ def _compute_field_dict(op):
"file": default_value,
"line": default_value,
"colocations": default_value,
+ "devices": default_value,
}
frame = _get_defining_frame_from_op(op)
if frame:
@@ -211,6 +259,9 @@ def _compute_field_dict(op):
colocation_summary = _compute_colocation_summary_from_op(op)
if colocation_summary:
field_dict["colocations"] = colocation_summary
+ device_summary = _compute_device_assignment_summary_from_op(op)
+ if device_summary:
+ field_dict["devices"] = device_summary
return field_dict
@@ -233,6 +284,8 @@ def interpolate(error_message, graph):
node_name_to_substitution_dict = {}
for name in [t.name for t in tags]:
+ if name in node_name_to_substitution_dict:
+ continue
try:
op = graph.get_operation_by_name(name)
except KeyError:
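
Putting the new ${devices} tag together, a minimal graph-mode sketch whose tag syntax mirrors the tests below:

    from tensorflow.python.framework import constant_op
    from tensorflow.python.framework import error_interpolation
    from tensorflow.python.framework import ops

    with ops.Graph().as_default() as graph:
      with ops.device("/cpu:0"):
        constant_op.constant(1.0, name="one")

    # Expands to the multi-line device-assignment summary for op "one";
    # ops with no assignments yield "No device assignments were active...".
    message = "^^node:one:${devices}^^"
    print(error_interpolation.interpolate(message, graph))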
diff --git a/tensorflow/python/framework/error_interpolation_test.py b/tensorflow/python/framework/error_interpolation_test.py
index 1e5cb73854..fbf182879b 100644
--- a/tensorflow/python/framework/error_interpolation_test.py
+++ b/tensorflow/python/framework/error_interpolation_test.py
@@ -57,13 +57,32 @@ def _modify_op_stack_with_filenames(op, num_user_frames, user_filename,
op._traceback = stack
-def assert_node_in_colocation_summary(test_obj, colocation_summary_string,
- name, filename="", lineno=""):
- lineno = str(lineno)
- name_phrase = "colocate_with(%s)" % name
- for term in [name_phrase, filename, lineno]:
- test_obj.assertIn(term, colocation_summary_string)
- test_obj.assertNotIn("loc:@", colocation_summary_string)
+class ComputeDeviceSummaryFromOpTest(test.TestCase):
+
+ def testCorrectFormatWithActiveDeviceAssignments(self):
+ assignments = []
+ assignments.append(
+ traceable_stack.TraceableObject("/cpu:0",
+ filename="hope.py",
+ lineno=24))
+ assignments.append(
+ traceable_stack.TraceableObject("/gpu:2",
+ filename="please.py",
+ lineno=42))
+
+ summary = error_interpolation._compute_device_summary_from_list(
+ assignments, prefix=" ")
+
+ self.assertIn("tf.device(/cpu:0)", summary)
+ self.assertIn("<hope.py:24>", summary)
+ self.assertIn("tf.device(/gpu:2)", summary)
+ self.assertIn("<please.py:42>", summary)
+
+ def testCorrectFormatWhenNoDeviceAssignmentsWereActive(self):
+ device_assignment_list = []
+ summary = error_interpolation._compute_device_summary_from_list(
+ device_assignment_list, prefix=" ")
+ self.assertIn("No device assignments", summary)
class ComputeColocationSummaryFromOpTest(test.TestCase):
@@ -81,15 +100,10 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
}
summary = error_interpolation._compute_colocation_summary_from_dict(
colocation_dict, prefix=" ")
- assert_node_in_colocation_summary(self,
- summary,
- name="test_node_1",
- filename="test_1.py",
- lineno=27)
- assert_node_in_colocation_summary(self, summary,
- name="test_node_2",
- filename="test_2.py",
- lineno=38)
+ self.assertIn("colocate_with(test_node_1)", summary)
+ self.assertIn("<test_1.py:27>", summary)
+ self.assertIn("colocate_with(test_node_2)", summary)
+ self.assertIn("<test_2.py:38>", summary)
def testCorrectFormatWhenNoColocationsWereActive(self):
colocation_dict = {}
@@ -98,9 +112,10 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
self.assertIn("No node-device colocations", summary)
-class InterpolateTest(test.TestCase):
+class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
def setUp(self):
+ ops.reset_default_graph()
# Add nodes to the graph for retrieval by name later.
constant_op.constant(1, name="One")
constant_op.constant(2, name="Two")
@@ -177,9 +192,57 @@ class InterpolateTest(test.TestCase):
self.assertRegexpMatches(interpolated_string, expected_regex)
+class InterpolateDeviceSummaryTest(test.TestCase):
+
+ def _fancy_device_function(self, unused_op):
+ return "/cpu:*"
+
+ def setUp(self):
+ ops.reset_default_graph()
+ self.zero = constant_op.constant([0.0], name="zero")
+ with ops.device("/cpu"):
+ self.one = constant_op.constant([1.0], name="one")
+ with ops.device("/cpu:0"):
+ self.two = constant_op.constant([2.0], name="two")
+ with ops.device(self._fancy_device_function):
+ self.three = constant_op.constant(3.0, name="three")
+
+ self.graph = self.three.graph
+
+ def testNodeZeroHasNoDeviceSummaryInfo(self):
+ message = "^^node:zero:${devices}^^"
+ result = error_interpolation.interpolate(message, self.graph)
+ self.assertIn("No device assignments were active", result)
+
+ def testNodeOneHasExactlyOneInterpolatedDevice(self):
+ message = "^^node:one:${devices}^^"
+ result = error_interpolation.interpolate(message, self.graph)
+ num_devices = result.count("tf.device")
+ self.assertEqual(1, num_devices)
+ self.assertIn("tf.device(/cpu)", result)
+
+ def testNodeTwoHasTwoInterpolatedDevices(self):
+ message = "^^node:two:${devices}^^"
+ result = error_interpolation.interpolate(message, self.graph)
+ num_devices = result.count("tf.device")
+ self.assertEqual(2, num_devices)
+ self.assertIn("tf.device(/cpu)", result)
+ self.assertIn("tf.device(/cpu:0)", result)
+
+ def testNodeThreeHasFancyFunctionDisplayNameForInterpolatedDevice(self):
+ message = "^^node:three:${devices}^^"
+ result = error_interpolation.interpolate(message, self.graph)
+ num_devices = result.count("tf.device")
+ self.assertEqual(1, num_devices)
+ name_re = r"_fancy_device_function<.*error_interpolation_test.py, [0-9]+>"
+ expected_re = r"with tf.device\(.*%s\)" % name_re
+ self.assertRegexpMatches(result, expected_re)
+
+
class InterpolateColocationSummaryTest(test.TestCase):
def setUp(self):
+ ops.reset_default_graph()
# Add nodes to the graph for retrieval by name later.
node_one = constant_op.constant(1, name="One")
node_two = constant_op.constant(2, name="Two")
@@ -203,12 +266,12 @@ class InterpolateColocationSummaryTest(test.TestCase):
def testNodeThreeHasColocationInterpolation(self):
message = "^^node:Three_with_one:${colocations}^^"
result = error_interpolation.interpolate(message, self.graph)
- assert_node_in_colocation_summary(self, result, name="One")
+ self.assertIn("colocate_with(One)", result)
def testNodeFourHasColocationInterpolationForNodeThreeOnly(self):
message = "^^node:Four_with_three:${colocations}^^"
result = error_interpolation.interpolate(message, self.graph)
- assert_node_in_colocation_summary(self, result, name="Three_with_one")
+ self.assertIn("colocate_with(Three_with_one)", result)
self.assertNotIn(
"One", result,
"Node One should not appear in Four_with_three's summary:\n%s"
@@ -217,8 +280,8 @@ class InterpolateColocationSummaryTest(test.TestCase):
def testNodeFiveHasColocationInterpolationForNodeOneAndTwo(self):
message = "^^node:Five_with_one_with_two:${colocations}^^"
result = error_interpolation.interpolate(message, self.graph)
- assert_node_in_colocation_summary(self, result, name="One")
- assert_node_in_colocation_summary(self, result, name="Two")
+ self.assertIn("colocate_with(One)", result)
+ self.assertIn("colocate_with(Two)", result)
def testColocationInterpolationForNodeLackingColocation(self):
message = "^^node:One:${colocations}^^"
diff --git a/tensorflow/python/framework/fast_tensor_util.pyx b/tensorflow/python/framework/fast_tensor_util.pyx
index 17d112a1ec..2e3e15f53a 100644
--- a/tensorflow/python/framework/fast_tensor_util.pyx
+++ b/tensorflow/python/framework/fast_tensor_util.pyx
@@ -6,6 +6,13 @@ cimport numpy as np
from tensorflow.python.util import compat
+def AppendBFloat16ArrayToTensorProto(
+ tensor_proto, np.ndarray[np.uint16_t, ndim=1] nparray):
+ cdef long i, n
+ n = nparray.size
+ for i in range(n):
+ tensor_proto.half_val.append(nparray[i])
+
def AppendFloat16ArrayToTensorProto(
# For numpy, npy_half is a typedef for npy_uint16,
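
For orientation, a hedged pure-numpy sketch of the reinterpretation the new Cython fast path relies on: bfloat16 values land in the proto's half_val field as raw 16-bit patterns, so the array is viewed as uint16 without copying:

    import numpy as np

    from tensorflow.core.framework import tensor_pb2
    from tensorflow.python.framework import dtypes

    proto = tensor_pb2.TensorProto()
    values = np.asarray([1.0, 2.5], dtype=dtypes.bfloat16.as_numpy_dtype)
    # Same effect as AppendBFloat16ArrayToTensorProto, minus the typed loop.
    proto.half_val.extend(values.view(np.uint16))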
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 6525607fae..c76743d2c6 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -38,8 +38,8 @@ from tensorflow.python.ops import cond_v2_impl
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.util import compat
+from tensorflow.python.util import function_utils
from tensorflow.python.util import tf_contextlib
-from tensorflow.python.util import tf_decorator
from tensorflow.python.util import tf_inspect
# This is to avoid a circular dependency with cond_v2_impl.
@@ -255,9 +255,12 @@ class _DefinedFunction(object):
# Constructed only when C API is enabled, lazily
self._c_func = None
self._sub_functions = dict() # Constructed with _definition or _c_func
- device_stack = ops.get_default_graph()._device_function_stack # pylint: disable=protected-access
+ # pylint: disable=protected-access
+ device_funcs = ops.get_default_graph()._device_functions_outer_to_inner
+ # pylint: enable=protected-access
+
# Get the innermost device if possible.
- self._caller_device = device_stack[-1] if device_stack else None
+ self._caller_device = device_funcs[-1] if device_funcs else None
# Cached OpDef for this function. When C API is enabled, this is
# the only part of FunctionDef that we cache in Python. When C API
@@ -354,7 +357,7 @@ class _DefinedFunction(object):
if self._func_name:
base_func_name = self._func_name
else:
- base_func_name = _get_func_name(self._func)
+ base_func_name = function_utils.get_func_name(self._func)
if self._grad_func:
base_func_name += ("_%s" % self._grad_func.name)
kwargs_attr = _parse_kwargs_as_attrs(base_func_name, **self._extra_kwargs)
@@ -841,7 +844,7 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
ValueError: if func returns None.
"""
if not name:
- name = _get_func_name(func)
+ name = function_utils.get_func_name(func)
func_graph = _FuncGraph(name, capture_by_value)
with func_graph.as_default(), ops.device(device):
@@ -1139,19 +1142,6 @@ def _parse_kwargs_as_attrs(func_name, **kwargs):
return attrs
-def _get_func_name(func):
- _, func = tf_decorator.unwrap(func)
- if callable(func):
- if tf_inspect.isfunction(func):
- return func.__name__
- elif tf_inspect.ismethod(func):
- return "%s.%s" % (func.__self__.__name__, func.__name__)
- else: # Probably a class instance with __call__
- return type(func)
- else:
- raise ValueError("Argument must be callable")
-
-
def get_extra_vars():
"""Returns the captured variables by the function.
diff --git a/tensorflow/python/framework/function_def_to_graph.py b/tensorflow/python/framework/function_def_to_graph.py
index 46c9c4c14a..1b09506662 100644
--- a/tensorflow/python/framework/function_def_to_graph.py
+++ b/tensorflow/python/framework/function_def_to_graph.py
@@ -25,7 +25,7 @@ from tensorflow.core.framework import types_pb2
from tensorflow.core.framework import versions_pb2
from tensorflow.python.framework import function
from tensorflow.python.framework import importer
-from tensorflow.python.framework import op_def_registry
+from tensorflow.python.framework import ops
from tensorflow.python.framework import versions
from tensorflow.python.ops import cond_v2_impl
@@ -114,6 +114,10 @@ def function_def_to_graph_def(fdef, input_shapes=None):
producer=versions.GRAPH_DEF_VERSION,
min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER))
+ # Copy *all* functions from outer graph to `graph_def` so that both direct
+ # and indirect references are safely handled.
+ ops.get_default_graph()._copy_functions_to_graph_def(graph_def, 0) # pylint: disable=protected-access
+
if input_shapes and len(input_shapes) != len(fdef.signature.input_arg):
raise ValueError("Length of input_shapes must match the number of " +
"input_args. len(input_shapes): {} len(input_arg): {}".
@@ -142,24 +146,18 @@ def function_def_to_graph_def(fdef, input_shapes=None):
nested_to_flat_tensor_name[arg_def.name] = "{}:0".format(arg_def.name)
for node_def in fdef.node_def:
- op_def = op_def_registry.get_registered_ops().get(node_def.op)
- if not op_def:
- # TODO(b/80470245): Support functions which refer other functions.
- raise NotImplementedError(
- "No op registered for {},".format(node_def.op) +
- " it may be a function. function_def_to_graph_def " +
- "currently does not support converting functions with " +
- "references to other graph functions.")
+ op_def = ops.get_default_graph()._get_op_def(node_def.op) # pylint: disable=protected-access
for attr in op_def.attr:
- if attr.type in ("func", "list(func)"):
- # TODO(b/80470245): Support functions which refer other functions.
- raise NotImplementedError("Unsupported attr {} ".format(attr.name) +
- " with type {}".format(attr.type) +
- " in op {}. ".format(op_def.name) +
- "function_def_to_graph_def currently does " +
- "not support converting functions with " +
- "references to other graph functions.")
+ if attr.type == "func":
+ fname = node_def.attr[attr.name].func.name
+ if not ops.get_default_graph()._is_function(fname): # pylint: disable=protected-access
+ raise ValueError("%s function not found." % fname)
+ elif attr.type == "list(func)":
+ for fn in node_def.attr[attr.name].list.func:
+ fname = fn.name
+ if not ops.get_default_graph()._is_function(fname): # pylint: disable=protected-access
+ raise ValueError("%s function not found." % fname)
# Iterate over output_args in op_def to build the map.
# Index of the output tensor in the flattened list of *all* output
diff --git a/tensorflow/python/framework/function_def_to_graph_test.py b/tensorflow/python/framework/function_def_to_graph_test.py
index 0f4e6ef54f..cd2a16ed5a 100644
--- a/tensorflow/python/framework/function_def_to_graph_test.py
+++ b/tensorflow/python/framework/function_def_to_graph_test.py
@@ -18,7 +18,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
from tensorflow.python.framework import function_def_to_graph
from tensorflow.python.framework import graph_to_function_def
from tensorflow.python.framework import ops
@@ -79,7 +81,6 @@ class FunctionDefToGraphTest(test.TestCase):
g = function_def_to_graph.function_def_to_graph(
fdef, input_shapes=[None, tensor_shape.matrix(5, 7)])
- print(g.as_graph_def())
self.assertIsNone(g.inputs[0].shape.dims)
self.assertSequenceEqual(g.inputs[1].shape.dims, [5, 7])
self.assertSequenceEqual(g.outputs[0].shape.dims, [5, 7])
@@ -179,6 +180,37 @@ class FunctionDefToGraphDefTest(test.TestCase):
self.assertEqual(g.node[0].attr["shape"].shape.unknown_rank, False)
self.assertFalse("shape" in g.node[2].attr)
+ def testFunctionCallsFromFunction(self):
+ x = constant_op.constant(5.0)
+ y = constant_op.constant(10.0)
+
+ @function.Defun()
+ def fn():
+
+ @function.Defun()
+ def inner_fn():
+ return x + y
+
+ return inner_fn()
+
+ # Instantiate the function in this graph so that
+ # `function_def_to_graph` can find it.
+ fn()
+
+ def fn2():
+ return 2 * fn()
+
+ fdef = function._DefinedFunction(fn2, [], []).definition
+ func_graph = function_def_to_graph.function_def_to_graph(fdef)
+ with func_graph.as_default():
+ x_ph, y_ph = func_graph.inputs
+ with self.test_session(graph=func_graph) as sess:
+ self.assertEqual(
+ sess.run(func_graph.outputs[0], feed_dict={
+ x_ph: 5.0,
+ y_ph: 10.0
+ }), 30.0)
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 699d2b70d1..687bfebd43 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -205,7 +205,7 @@ def _PopulateTFImportGraphDefOptions(options, prefix, input_map,
for input_src, input_dst in input_map.items():
input_src = compat.as_str(input_src)
if input_src.startswith('^'):
- src_name = compat.as_bytes(input_src[1:])
+ src_name = compat.as_str(input_src[1:])
dst_op = input_dst._as_tf_output().oper # pylint: disable=protected-access
c_api.TF_ImportGraphDefOptionsRemapControlDependency(
options, src_name, dst_op)
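
A hedged sketch of the remap path touched above; '^'-prefixed input_map keys redirect control dependencies, and the stripped key now crosses into the C API as str rather than bytes:

    from tensorflow.python.framework import constant_op
    from tensorflow.python.framework import importer
    from tensorflow.python.framework import ops

    src = ops.Graph()
    with src.as_default():
      a = constant_op.constant(1, name="A")
      with src.control_dependencies([a]):
        constant_op.constant(2, name="B")  # B carries control edge ^A

    with ops.Graph().as_default():
      replacement = constant_op.constant(0, name="ctrl")
      # Every control dependency on "A" is remapped onto `replacement`.
      importer.import_graph_def(
          src.as_graph_def(), input_map={"^A": replacement},
          return_elements=["B"])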
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index c5a54470d2..7182c28666 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
from tensorflow.python.framework import function
from tensorflow.python.framework import importer
from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import test_ops # pylint: disable=unused-import
from tensorflow.python.framework import versions
from tensorflow.python.ops import array_ops
@@ -419,6 +420,46 @@ class ImportGraphDefTest(test.TestCase):
with self.test_session() as sess:
self.assertEqual(sess.run(imported_r), 10)
+ def testImportWhileLoopInCond(self):
+ # Produce GraphDef containing while loop.
+ graph = ops.Graph()
+ with graph.as_default():
+ r = control_flow_ops.while_loop(lambda i: i < 10, lambda i: i + 1, [0])
+ graph_def = graph.as_graph_def()
+
+ # Import the GraphDef inside a cond and make sure it runs.
+ with ops.Graph().as_default():
+
+ def ImportFn():
+ return importer.import_graph_def(graph_def, return_elements=[r.name])[0]
+
+ pred = array_ops.placeholder(dtypes.bool)
+ out = control_flow_ops.cond(pred, ImportFn,
+ lambda: constant_op.constant(1))
+ with self.test_session() as sess:
+ self.assertEqual(sess.run(out, {pred: True}), 10)
+ self.assertEqual(sess.run(out, {pred: False}), 1)
+
+ def testImportWhileLoopInWhileLoop(self):
+ self.skipTest("b/111757448")
+ # Produce GraphDef containing while loop.
+ graph = ops.Graph()
+ with graph.as_default():
+ r = control_flow_ops.while_loop(lambda i: i < 10, lambda i: i + 1, [0])
+ graph_def = graph.as_graph_def()
+
+ # Import the GraphDef inside another loop and make sure it runs.
+ with ops.Graph().as_default():
+
+ def ImportFn(_):
+ return importer.import_graph_def(graph_def, return_elements=[r.name])[0]
+
+ out = control_flow_ops.while_loop(
+ lambda i: i < 2, ImportFn, [0],
+ shape_invariants=[tensor_shape.TensorShape(None)])
+ with self.test_session() as sess:
+ self.assertEqual(sess.run(out), 10)
+
def testTypeMismatchInGraphDef(self):
# TODO(skyewm): improve error message
error_msg = ("Input 0 of node import/B was passed int32 from import/A:0 "
diff --git a/tensorflow/python/framework/kernels.py b/tensorflow/python/framework/kernels.py
new file mode 100644
index 0000000000..f7641f3442
--- /dev/null
+++ b/tensorflow/python/framework/kernels.py
@@ -0,0 +1,46 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for querying registered kernels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import kernel_def_pb2
+from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python.util import compat
+
+
+def get_all_registered_kernels():
+ """Returns a KernelList proto of all registered kernels.
+ """
+ buf = c_api.TF_GetAllRegisteredKernels()
+ data = c_api.TF_GetBuffer(buf)
+ kernel_list = kernel_def_pb2.KernelList()
+ kernel_list.ParseFromString(compat.as_bytes(data))
+ return kernel_list
+
+
+def get_registered_kernels_for_op(name):
+ """Returns a KernelList proto of registered kernels for a given op.
+
+ Args:
+ name: A string representing the name of the op whose kernels to retrieve.
+ """
+ buf = c_api.TF_GetRegisteredKernelsForOp(name)
+ data = c_api.TF_GetBuffer(buf)
+ kernel_list = kernel_def_pb2.KernelList()
+ kernel_list.ParseFromString(compat.as_bytes(data))
+ return kernel_list
diff --git a/tensorflow/python/framework/kernels_test.py b/tensorflow/python/framework/kernels_test.py
new file mode 100644
index 0000000000..c53500be73
--- /dev/null
+++ b/tensorflow/python/framework/kernels_test.py
@@ -0,0 +1,41 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for querying registered kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import kernels
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+class GetAllRegisteredKernelsTest(test_util.TensorFlowTestCase):
+
+ def testFindsAtLeastOneKernel(self):
+ kernel_list = kernels.get_all_registered_kernels()
+ self.assertGreater(len(kernel_list.kernel), 0)
+
+
+class GetRegisteredKernelsForOp(test_util.TensorFlowTestCase):
+
+ def testFindsAtLeastOneKernel(self):
+ kernel_list = kernels.get_registered_kernels_for_op("KernelLabel")
+ self.assertGreater(len(kernel_list.kernel), 0)
+ self.assertEqual(kernel_list.kernel[0].op, "KernelLabel")
+
+
+if __name__ == "__main__":
+ googletest.main()
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 923e76fc9c..33631282bd 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -696,6 +696,67 @@ def import_scoped_meta_graph(meta_graph_or_file,
Raises:
ValueError: If the graph_def contains unbound inputs.
"""
+ return import_scoped_meta_graph_with_return_elements(
+ meta_graph_or_file, clear_devices, graph, import_scope, input_map,
+ unbound_inputs_col_name, restore_collections_predicate)[0]
+
+
+def import_scoped_meta_graph_with_return_elements(
+ meta_graph_or_file,
+ clear_devices=False,
+ graph=None,
+ import_scope=None,
+ input_map=None,
+ unbound_inputs_col_name="unbound_inputs",
+ restore_collections_predicate=(lambda key: True),
+ return_elements=None):
+ """Imports graph from `MetaGraphDef` and returns vars and return elements.
+
+ This function takes a `MetaGraphDef` protocol buffer as input. If
+ the argument is a file containing a `MetaGraphDef` protocol buffer,
+ it constructs a protocol buffer from the file content. The function
+ then adds all the nodes from the `graph_def` field to the
+ current graph, recreates the desired collections, and returns a dictionary of
+ all the Variables imported into the name scope.
+
+ In combination with `export_scoped_meta_graph()`, this function can be used to
+
+ * Serialize a graph along with other Python objects such as `QueueRunner`,
+ `Variable` into a `MetaGraphDef`.
+
+ * Restart training from a saved graph and checkpoints.
+
+ * Run inference from a saved graph and checkpoints.
+
+ Args:
+ meta_graph_or_file: `MetaGraphDef` protocol buffer or filename (including
+ the path) containing a `MetaGraphDef`.
+ clear_devices: Boolean which controls whether to clear device information
+ from graph_def. Default false.
+ graph: The `Graph` to import into. If `None`, use the default graph.
+ import_scope: Optional `string`. Name scope into which to import the
+ subgraph. If `None`, the graph is imported to the root name scope.
+ input_map: A dictionary mapping input names (as strings) in `graph_def` to
+ `Tensor` objects. The values of the named input tensors in the imported
+ graph will be re-mapped to the respective `Tensor` values.
+ unbound_inputs_col_name: Collection name for looking up unbound inputs.
+ restore_collections_predicate: a predicate on collection names. A collection
+ named c (i.e. whose key is c) will be restored iff
+ 1) `restore_collections_predicate(c)` is True, and
+ 2) `c != unbound_inputs_col_name`.
+ return_elements: A list of strings containing operation names in the
+ `MetaGraphDef` that will be returned as `Operation` objects; and/or
+ tensor names in `MetaGraphDef` that will be returned as `Tensor` objects.
+
+ Returns:
+ A tuple of (
+ dictionary of all the `Variables` imported into the name scope,
+ list of `Operation` or `Tensor` objects from the `return_elements` list).
+
+ Raises:
+ ValueError: If the graph_def contains unbound inputs.
+ """
if context.executing_eagerly():
raise ValueError("Exporting/importing meta graphs is not supported when "
"eager execution is enabled.")
@@ -737,11 +798,12 @@ def import_scoped_meta_graph(meta_graph_or_file,
scope_to_prepend_to_names = graph.unique_name(
import_scope or "", mark_as_used=False)
- importer.import_graph_def(
+ imported_return_elements = importer.import_graph_def(
input_graph_def,
name=(import_scope or scope_to_prepend_to_names),
input_map=input_map,
- producer_op_list=producer_op_list)
+ producer_op_list=producer_op_list,
+ return_elements=return_elements)
# Restores all the other collections.
variable_objects = {}
@@ -806,7 +868,7 @@ def import_scoped_meta_graph(meta_graph_or_file,
for v in variables:
var_list[ops.strip_name_scope(v.name, scope_to_prepend_to_names)] = v
- return var_list
+ return var_list, imported_return_elements
def export_scoped_meta_graph(filename=None,
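
A hedged usage sketch of the new wrapper; 'model.meta' and 'logits:0' are hypothetical placeholders for an exported meta graph and one of its tensors:

    from tensorflow.python.framework import meta_graph

    var_list, return_elems = (
        meta_graph.import_scoped_meta_graph_with_return_elements(
            "model.meta",                   # hypothetical meta graph file
            import_scope="imported",
            return_elements=["logits:0"]))  # hypothetical tensor name
    logits = return_elems[0]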
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index b813cd6c06..c25e29b0f4 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -50,14 +50,15 @@ from tensorflow.python.framework import registry
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import traceable_stack
from tensorflow.python.framework import versions
-from tensorflow.python.util import tf_stack
from tensorflow.python.ops import control_flow_util
from tensorflow.python.platform import app
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import compat
from tensorflow.python.util import decorator_utils
+from tensorflow.python.util import function_utils
from tensorflow.python.util import lock_util
from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util import tf_stack
from tensorflow.python.util.deprecation import deprecated_args
from tensorflow.python.util.tf_export import tf_export
@@ -73,6 +74,31 @@ def tensor_id(tensor):
return tensor._id # pylint: disable=protected-access
+class _UserDeviceSpec(object):
+ """Store user-specified device and provide computation of merged device."""
+
+ def __init__(self, device_name_or_function):
+ self._device_name_or_function = device_name_or_function
+
+ self.display_name = str(self._device_name_or_function)
+ if callable(self._device_name_or_function):
+ dev_func = self._device_name_or_function
+ func_name = function_utils.get_func_name(dev_func)
+ func_code = function_utils.get_func_code(dev_func)
+ if func_code:
+ fname = func_code.co_filename
+ lineno = func_code.co_firstlineno
+ else:
+ fname = "unknown"
+ lineno = -1
+ self.display_name = "%s<%s, %d>" % (func_name, fname, lineno)
+
+ self.function = self._device_name_or_function
+ if not (self._device_name_or_function is None or
+ callable(self._device_name_or_function)):
+ self.function = pydev.merge_device(self._device_name_or_function)
+
+
class _NullContextmanager(object):
def __enter__(self):
@@ -709,7 +735,7 @@ class _EagerTensorBase(Tensor):
raise ValueError("Resource handles are not convertible to numpy.")
return self._cpu_nograd()._numpy() # pylint: disable=protected-access
- # __int__ and __float__ may copy the tensor to CPU and
+ # __int__, __float__ and __index__ may copy the tensor to CPU and
# only work for scalars; values are cast as per numpy.
def __int__(self):
return int(self.numpy())
@@ -717,6 +743,9 @@ class _EagerTensorBase(Tensor):
def __float__(self):
return float(self.numpy())
+ def __index__(self):
+ return int(self.numpy())
+
def __array__(self, dtype=None):
return np.array(self.numpy(), dtype=dtype)
@@ -1716,7 +1745,12 @@ class Operation(object):
self._id_value = self._graph._next_id()
self._original_op = original_op
self._traceback = tf_stack.extract_stack()
- # List of traceable_stack.TraceableObjects for colocation context managers.
+
+ # List of _UserDeviceSpecs holding the code location of each device
+ # context manager invocation and the user's original argument to it.
+ self._device_code_locations = None
+ # Dict mapping op name to file and line information for op colocation
+ # context managers.
self._colocation_code_locations = None
self._control_flow_context = self.graph._get_control_flow_context()
# pylint: enable=protected-access
@@ -1858,6 +1892,37 @@ class Operation(object):
return c_api.TF_OperationDevice(self._c_op)
@property
+ def _device_assignments(self):
+ """Code locations for device context managers active at op creation.
+
+ This property will return a list of traceable_stack.TraceableObject
+ instances where .obj is a string representing the assigned device
+ (or information about the function that would be applied to this op
+ to compute the desired device) and the filename and lineno members
+ record the location of the relevant device context manager.
+
+ For example, suppose file_a contained these lines:
+
+ file_a.py:
+ 15: with tf.device('/gpu:0'):
+ 16: node_b = tf.constant(4, name='NODE_B')
+
+ Then a TraceableObject t_obj representing the device context manager
+ would have these member values:
+
+ t_obj.obj -> '/gpu:0'
+ t_obj.filename = 'file_a.py'
+ t_obj.lineno = 15
+
+ and node_b.op._device_assignments would return the list [t_obj].
+
+ Returns:
+ A list of traceable_stack.TraceableObject instances as described
+ above.
+ """
+ return self._device_code_locations or []
+
+ @property
def _colocation_dict(self):
"""Code locations for colocation context managers active at op creation.
@@ -1878,11 +1943,10 @@ class Operation(object):
would have these member values:
t_obj.obj -> None
- t_obj.name = 'NODE_A'
t_obj.filename = 'file_a.py'
t_obj.lineno = 15
- and node_b.op._colocation_code_locations would return the dictionary
+ and node_b.op._colocation_dict would return the dictionary
{ 'NODE_A': t_obj }
@@ -2732,7 +2796,7 @@ class Graph(object):
# Functions that will be applied to choose a device if none is specified.
# After switch_to_thread_local(), self._thread_local._device_function_stack
# is used instead.
- self._graph_device_function_stack = []
+ self._graph_device_function_stack = traceable_stack.TraceableStack()
# Default original_op applied to new ops.
self._default_original_op = None
# Current control flow context. It could be either CondContext or
@@ -3776,8 +3840,8 @@ class Graph(object):
Nothing.
"""
old_original_op = self._default_original_op
+ self._default_original_op = op
try:
- self._default_original_op = op
yield
finally:
self._default_original_op = old_original_op
@@ -3894,15 +3958,15 @@ class Graph(object):
# op name regex, which constrains the initial character.
if not _VALID_OP_NAME_REGEX.match(name):
raise ValueError("'%s' is not a valid scope name" % name)
+ old_stack = self._name_stack
+ if not name: # Both for name=None and name="" we re-set to empty scope.
+ new_stack = None
+ elif name[-1] == "/":
+ new_stack = _name_from_scope_name(name)
+ else:
+ new_stack = self.unique_name(name)
+ self._name_stack = new_stack
try:
- old_stack = self._name_stack
- if not name: # Both for name=None and name="" we re-set to empty scope.
- new_stack = None
- elif name[-1] == "/":
- new_stack = _name_from_scope_name(name)
- else:
- new_stack = self.unique_name(name)
- self._name_stack = new_stack
yield "" if new_stack is None else new_stack + "/"
finally:
self._name_stack = old_stack
@@ -3983,8 +4047,8 @@ class Graph(object):
ignore_existing=False):
with self.colocate_with(op, ignore_existing):
if gradient_uid is not None and self._control_flow_context is not None:
+ self._control_flow_context.EnterGradientColocation(op, gradient_uid)
try:
- self._control_flow_context.EnterGradientColocation(op, gradient_uid)
yield
finally:
self._control_flow_context.ExitGradientColocation(op, gradient_uid)
@@ -4026,7 +4090,6 @@ class Graph(object):
Yields:
A context manager that specifies the op with which to colocate
newly created ops.
-
"""
if op is None and not ignore_existing:
raise ValueError("Trying to reset colocation (op is None) but "
@@ -4044,7 +4107,7 @@ class Graph(object):
# In the future, a caller may specify that device_functions win
# over colocation, in which case we can add support.
device_fn_tmp = self._device_function_stack
- self._device_function_stack = []
+ self._device_function_stack = traceable_stack.TraceableStack()
if ignore_existing:
current_stack = self._colocation_stack
@@ -4068,6 +4131,13 @@ class Graph(object):
if ignore_existing:
self._colocation_stack = current_stack
+ def _add_device_to_stack(self, device_name_or_function, offset=0):
+ """Add device to stack manually, separate from a context manager."""
+ total_offset = 1 + offset
+ spec = _UserDeviceSpec(device_name_or_function)
+ self._device_function_stack.push_obj(spec, offset=total_offset)
+ return spec
+
@tf_contextlib.contextmanager
def device(self, device_name_or_function):
# pylint: disable=line-too-long
@@ -4125,31 +4195,26 @@ class Graph(object):
Yields:
A context manager that specifies the default device to use for newly
created ops.
-
"""
- # pylint: enable=line-too-long
- if (device_name_or_function is not None and
- not callable(device_name_or_function)):
- device_function = pydev.merge_device(device_name_or_function)
- else:
- device_function = device_name_or_function
-
+ self._add_device_to_stack(device_name_or_function, offset=2)
try:
- self._device_function_stack.append(device_function)
yield
finally:
- self._device_function_stack.pop()
+ self._device_function_stack.pop_obj()
def _apply_device_functions(self, op):
"""Applies the current device function stack to the given operation."""
- # Apply any device functions in reverse order, so that the most recently
+ # Apply any device functions in LIFO order, so that the most recently
# pushed function has the first chance to apply a device to the op.
# We apply here because the result can depend on the Operation's
# signature, which is computed in the Operation constructor.
- for device_function in reversed(self._device_function_stack):
- if device_function is None:
+ # pylint: disable=protected-access
+ for device_spec in self._device_function_stack.peek_objs():
+ if device_spec.function is None:
break
- op._set_device(device_function(op)) # pylint: disable=protected-access
+ op._set_device(device_spec.function(op))
+ op._device_code_locations = self._snapshot_device_function_stack_metadata()
+ # pylint: enable=protected-access
# pylint: disable=g-doc-return-or-yield
@tf_contextlib.contextmanager
@@ -4198,8 +4263,8 @@ class Graph(object):
yields the container name.
"""
original_container = self._container
+ self._container = container_name
try:
- self._container = container_name
yield self._container
finally:
self._container = original_container
@@ -4673,17 +4738,45 @@ class Graph(object):
if self._stack_state_is_thread_local:
# This may be called from a thread where device_function_stack doesn't yet
# exist.
+ # pylint: disable=protected-access
if not hasattr(self._thread_local, "_device_function_stack"):
- self._thread_local._device_function_stack = (
- self._graph_device_function_stack[:])
+ stack_copy_for_this_thread = self._graph_device_function_stack.copy()
+ self._thread_local._device_function_stack = stack_copy_for_this_thread
return self._thread_local._device_function_stack
+ # pylint: enable=protected-access
else:
return self._graph_device_function_stack
+ @property
+ def _device_functions_outer_to_inner(self):
+ user_device_specs = self._device_function_stack.peek_objs()
+ device_functions = [spec.function for spec in user_device_specs]
+ device_functions_outer_to_inner = list(reversed(device_functions))
+ return device_functions_outer_to_inner
+
+ def _snapshot_device_function_stack_metadata(self):
+ """Return device function stack as a list of TraceableObjects.
+
+ Returns:
+ [traceable_stack.TraceableObject, ...] where each TraceableObject's .obj
+ member is a displayable name for the user's argument to Graph.device, and
+ the filename and lineno members point to the code location where
+ Graph.device was called directly or indirectly by the user.
+ """
+ traceable_objects = self._device_function_stack.peek_traceable_objs()
+ snapshot = []
+ for obj in traceable_objects:
+ obj_copy = obj.copy_metadata()
+ obj_copy.obj = obj.obj.display_name
+ snapshot.append(obj_copy)
+ return snapshot
+
@_device_function_stack.setter
def _device_function_stack(self, device_function_stack):
if self._stack_state_is_thread_local:
+ # pylint: disable=protected-access
self._thread_local._device_function_stack = device_function_stack
+ # pylint: enable=protected-access
else:
self._graph_device_function_stack = device_function_stack
@@ -4693,12 +4786,12 @@ class Graph(object):
if self._stack_state_is_thread_local:
# This may be called from a thread where colocation_stack doesn't yet
# exist.
+ # pylint: disable=protected-access
if not hasattr(self._thread_local, "_colocation_stack"):
stack_copy_for_this_thread = self._graph_colocation_stack.copy()
- # pylint: disable=protected-access
self._thread_local._colocation_stack = stack_copy_for_this_thread
- # pylint: enable=protected-access
return self._thread_local._colocation_stack
+ # pylint: enable=protected-access
else:
return self._graph_colocation_stack
@@ -4710,7 +4803,9 @@ class Graph(object):
@_colocation_stack.setter
def _colocation_stack(self, colocation_stack):
if self._stack_state_is_thread_local:
+ # pylint: disable=protected-access
self._thread_local._colocation_stack = colocation_stack
+ # pylint: enable=protected-access
else:
self._graph_colocation_stack = colocation_stack
@@ -4879,8 +4974,8 @@ class _DefaultStack(threading.local):
@tf_contextlib.contextmanager
def get_controller(self, default):
"""A context manager for manipulating a default stack."""
+ self.stack.append(default)
try:
- self.stack.append(default)
yield default
finally:
# stack may be empty if reset() was called
@@ -5068,13 +5163,15 @@ class _DefaultGraphStack(_DefaultStack): # pylint: disable=protected-access
@tf_contextlib.contextmanager
def get_controller(self, default):
+ context.context().context_switches.push(
+ default.building_function, default.as_default)
try:
- context.context().context_switches.push(
- default.building_function, default.as_default)
with super(_DefaultGraphStack, self).get_controller(
default) as g, context.graph_mode():
yield g
finally:
+ # If an exception is raised here it may be hiding a related exception in
+ # the try-block (just above).
context.context().context_switches.pop()
@@ -5110,6 +5207,9 @@ def init_scope():
`init_scope` will simply install a fresh graph as the default one.
(3) The gradient tape is paused while the scope is active.
+
+ Raises:
+ RuntimeError: if graph state is incompatible with this initialization.
"""
# pylint: enable=g-doc-return-or-yield,line-too-long
@@ -5122,10 +5222,10 @@ def init_scope():
# the name scope of the current context.
default_graph = get_default_graph()
scope = default_graph.get_name_scope()
- if scope and scope[-1] != '/':
+ if scope and scope[-1] != "/":
# Names that end with trailing slashes are treated by `name_scope` as
# absolute.
- scope = scope + '/'
+ scope = scope + "/"
inner_device_stack = default_graph._device_function_stack # pylint: disable=protected-access
outer_context = None
@@ -5170,6 +5270,8 @@ def init_scope():
outer_graph._device_function_stack = inner_device_stack # pylint: disable=protected-access
yield
finally:
+ # If an exception is raised here it may be hiding a related exception in
+ # the try-block (just above).
if outer_graph is not None:
outer_graph._device_function_stack = outer_device_stack # pylint: disable=protected-access
@@ -5237,7 +5339,10 @@ def enable_eager_execution(config=None,
to this function.
"""
return enable_eager_execution_internal(
- config, device_policy, execution_mode, None)
+ config=config,
+ device_policy=device_policy,
+ execution_mode=execution_mode,
+ server_def=None)
def enable_eager_execution_internal(config=None,
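
What the new device bookkeeping records can be seen with a short graph-mode sketch; pin_to_cpu is an illustrative device function:

    from tensorflow.python.framework import constant_op
    from tensorflow.python.framework import ops

    def pin_to_cpu(unused_op):
      return "/cpu:0"

    with ops.Graph().as_default():
      with ops.device(pin_to_cpu):
        node = constant_op.constant(1.0, name="pinned")

    # Each entry pairs the device string (or a "func<file, line>" display
    # name for callables) with the location of the ops.device() call.
    for t in node.op._device_assignments:  # pylint: disable=protected-access
      print(t.obj, t.filename, t.lineno)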
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index f848b69782..48328a7f58 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
import gc
+import os
import threading
import weakref
@@ -2542,6 +2543,56 @@ class StatisticsTest(test_util.TensorFlowTestCase):
self.assertEqual(3, flops_total.value)
+class DeviceStackTest(test_util.TensorFlowTestCase):
+
+ def testBasicDeviceAssignmentMetadata(self):
+
+ def device_func(unused_op):
+ return "/cpu:*"
+
+ const_zero = constant_op.constant([0.0], name="zero")
+ with ops.device("/cpu"):
+ const_one = constant_op.constant([1.0], name="one")
+ with ops.device("/cpu:0"):
+ const_two = constant_op.constant([2.0], name="two")
+ with ops.device(device_func):
+ const_three = constant_op.constant(3.0, name="three")
+
+ self.assertEqual(0, len(const_zero.op._device_assignments))
+
+ one_list = const_one.op._device_assignments
+ self.assertEqual(1, len(one_list))
+ self.assertEqual("/cpu", one_list[0].obj)
+ self.assertEqual("ops_test.py", os.path.basename(one_list[0].filename))
+
+ two_list = const_two.op._device_assignments
+ self.assertEqual(2, len(two_list))
+ devices = [t.obj for t in two_list]
+ self.assertEqual(set(["/cpu", "/cpu:0"]), set(devices))
+
+ three_list = const_three.op._device_assignments
+ self.assertEqual(1, len(three_list))
+ func_description = three_list[0].obj
+ expected_regex = r"device_func<.*ops_test.py, [0-9]+"
+ self.assertRegexpMatches(func_description, expected_regex)
+
+ def testDeviceAssignmentMetadataForGraphDeviceAndTfDeviceFunctions(self):
+
+ with ops.device("/cpu"):
+ const_one = constant_op.constant([1.0], name="one")
+ with ops.get_default_graph().device("/cpu"):
+ const_two = constant_op.constant([2.0], name="two")
+
+ one_metadata = const_one.op._device_assignments[0]
+ two_metadata = const_two.op._device_assignments[0]
+
+ # Verify both types of device assignment return the right stack info.
+ self.assertRegexpMatches("ops_test.py",
+ os.path.basename(one_metadata.filename))
+ self.assertEqual(one_metadata.filename, two_metadata.filename)
+ self.assertEqual(one_metadata.lineno + 2, two_metadata.lineno)
+
+
class ColocationGroupTest(test_util.TensorFlowTestCase):
def testBasic(self):
@@ -2554,13 +2605,17 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
with self.assertRaises(ValueError):
c.op.get_attr("_class")
- # Roughly test that stack information is being saved correctly for the op.
- locations_dict = b.op._colocation_dict
- self.assertIn("a", locations_dict)
- metadata = locations_dict["a"]
+ def testBasicColocationMetadata(self):
+ const_two = constant_op.constant([2.0], name="two")
+ with ops.colocate_with(const_two.op):
+ const_three = constant_op.constant(3.0, name="three")
+ locations_dict = const_three.op._colocation_dict
+ self.assertIn("two", locations_dict)
+ metadata = locations_dict["two"]
self.assertIsNone(metadata.obj)
- basename = metadata.filename.split("/")[-1]
- self.assertEqual("ops_test.py", basename)
+ # Check that this test's filename is recorded as the file containing the
+ # colocation statement.
+ self.assertEqual("ops_test.py", os.path.basename(metadata.filename))
def testColocationDeviceInteraction(self):
with ops.device("/cpu:0"):
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index ca63efbc84..9a0f34fad2 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -67,10 +67,16 @@ def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
[ExtractBitsFromBFloat16(x) for x in proto_values])
+def FastAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
+ fast_tensor_util.AppendBFloat16ArrayToTensorProto(
+ tensor_proto, np.asarray(
+ proto_values, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
+
+
if _FAST_TENSOR_UTIL_AVAILABLE:
_NP_TO_APPEND_FN = {
dtypes.bfloat16.as_numpy_dtype:
- SlowAppendBFloat16ArrayToTensorProto,
+ FastAppendBFloat16ArrayToTensorProto,
np.float16:
_MediumAppendFloat16ArrayToTensorProto,
np.float32:
@@ -935,8 +941,10 @@ def constant_value_as_shape(tensor): # pylint: disable=invalid-name
def is_tensor(x): # pylint: disable=invalid-name
"""Check whether `x` is of tensor type.
- Check whether an object is a tensor. Equivalent to
- `isinstance(x, [tf.Tensor, tf.SparseTensor, tf.Variable])`.
+  Check whether an object is a tensor. This check is equivalent to calling
+  `isinstance(x, (tf.Tensor, tf.SparseTensor, tf.Variable))` and also checks
+  whether all the component variables of a MirroredVariable or a
+  TowerLocalVariable are tensors.
Args:
x: A python object to check.
@@ -944,4 +952,5 @@ def is_tensor(x): # pylint: disable=invalid-name
Returns:
`True` if `x` is a tensor, `False` if not.
"""
- return isinstance(x, ops._TensorLike) or ops.is_dense_tensor_like(x) # pylint: disable=protected-access
+ return (isinstance(x, ops._TensorLike) or ops.is_dense_tensor_like(x) or # pylint: disable=protected-access
+ (hasattr(x, "is_tensor_like") and x.is_tensor_like))
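
The new duck-typed branch means any object exposing a truthy `is_tensor_like`
attribute now passes `is_tensor`, which is how distributed value wrappers opt
in. A minimal sketch (`FakeDistributedValue` is a hypothetical stand-in, not a
real TensorFlow class):

    from tensorflow.python.framework import tensor_util

    class FakeDistributedValue(object):
      # Wrappers such as MirroredVariable advertise tensor-likeness through
      # this attribute rather than by subclassing Tensor.
      is_tensor_like = True

    print(tensor_util.is_tensor(FakeDistributedValue()))  # True
    print(tensor_util.is_tensor("not a tensor"))          # False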
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 2bc2a189fa..fc47b1cca5 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+import collections
+from collections import OrderedDict
import contextlib
import gc
import itertools
@@ -571,6 +573,78 @@ def assert_no_garbage_created(f):
return decorator
+def _combine_named_parameters(**kwargs):
+  """Generate combinations based on the keyword arguments.
+
+ Two sets of returned combinations can be concatenated using +. Their product
+ can be computed using `times()`.
+
+ Args:
+ **kwargs: keyword arguments of form `option=[possibilities, ...]`
+ or `option=the_only_possibility`.
+
+ Returns:
+ a list of dictionaries for each combination. Keys in the dictionaries are
+ the keyword argument names. Each key has one value - one of the
+ corresponding keyword argument values.
+ """
+ if not kwargs:
+ return [OrderedDict()]
+
+ sort_by_key = lambda k: k[0][0]
+ kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
+ first = list(kwargs.items())[0]
+
+ rest = dict(list(kwargs.items())[1:])
+ rest_combined = _combine_named_parameters(**rest)
+
+ key = first[0]
+ values = first[1]
+ if not isinstance(values, list):
+ values = [values]
+
+ combinations = [
+ OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key))
+ for v in values
+ for combined in rest_combined
+ ]
+ return combinations
+
+
+def generate_combinations_with_testcase_name(**kwargs):
+  """Generate named combinations based on the keyword arguments.
+
+  This function calls `_combine_named_parameters()` and appends a testcase
+  name to each of the dictionaries returned. The 'testcase_name' key is
+  required for named parameterized tests.
+
+ Args:
+ **kwargs: keyword arguments of form `option=[possibilities, ...]`
+ or `option=the_only_possibility`.
+
+ Returns:
+ a list of dictionaries for each combination. Keys in the dictionaries are
+ the keyword argument names. Each key has one value - one of the
+ corresponding keyword argument values.
+ """
+ combinations = _combine_named_parameters(**kwargs)
+ named_combinations = []
+ for combination in combinations:
+ assert isinstance(combination, OrderedDict)
+ name = "".join([
+ "_{}_{}".format(
+ "".join(filter(str.isalnum, key)),
+ "".join(filter(str.isalnum, str(value))))
+ for key, value in combination.items()
+ ])
+ named_combinations.append(
+ OrderedDict(
+ list(combination.items()) + [("testcase_name",
+ "_test{}".format(name))]))
+
+ return named_combinations
+
+
def run_all_in_graph_and_eager_modes(cls):
"""Execute all test methods in the given class with and without eager."""
base_decorator = run_in_graph_and_eager_modes
@@ -1227,8 +1301,8 @@ class TensorFlowTestCase(googletest.TestCase):
a = a._asdict()
if hasattr(b, "_asdict"):
b = b._asdict()
- a_is_dict = isinstance(a, dict)
- if a_is_dict != isinstance(b, dict):
+ a_is_dict = isinstance(a, collections.Mapping)
+ if a_is_dict != isinstance(b, collections.Mapping):
raise ValueError("Can't compare dict to non-dict, a%s vs b%s. %s" %
(path_str, path_str, msg))
if a_is_dict:
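
To illustrate the helpers added above, a sketch of roughly what
`generate_combinations_with_testcase_name` returns for two options (keys are
sorted by their first character, and non-alphanumeric characters are stripped
from the generated names):

    from tensorflow.python.framework import test_util

    combos = test_util.generate_combinations_with_testcase_name(
        mode=["graph", "eager"], use_bias=True)
    for combo in combos:
      print(dict(combo))
    # {'mode': 'graph', 'use_bias': True,
    #  'testcase_name': '_test_mode_graph_usebias_True'}
    # {'mode': 'eager', 'use_bias': True,
    #  'testcase_name': '_test_mode_eager_usebias_True'}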
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 122c14c847..f983cbef04 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -73,7 +73,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
test_util.assert_equal_graph_def(def_57, def_75)
# Compare two unequal graphs
with self.assertRaisesRegexp(AssertionError,
- r"^Found unexpected node 'seven"):
+ r"^Found unexpected node '{{node seven}}"):
test_util.assert_equal_graph_def(def_57, def_empty)
def testIsGoogleCudaEnabled(self):
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index a495d48545..df409d2aa5 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -732,7 +732,6 @@ py_test(
size = "medium",
srcs = ["preprocessing/image_test.py"],
srcs_version = "PY2AND3",
- tags = ["nomsan"], # TODO(b/110990716) reenable
deps = [
":keras",
"//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index f608dea430..99645de736 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -128,20 +128,26 @@ def softsign(x):
@tf_export('keras.activations.relu')
-def relu(x, alpha=0., max_value=None):
+def relu(x, alpha=0., max_value=None, threshold=0):
"""Rectified Linear Unit.
+ With default values, it returns element-wise `max(x, 0)`.
+
+ Otherwise, it follows:
+ `f(x) = max_value` for `x >= max_value`,
+ `f(x) = x` for `threshold <= x < max_value`,
+ `f(x) = alpha * (x - threshold)` otherwise.
+
Arguments:
- x: Input tensor.
- alpha: Slope of the negative part. Defaults to zero.
- max_value: Maximum value for the output.
+ x: A tensor or variable.
+ alpha: A scalar, slope of negative section (default=`0.`).
+ max_value: float. Saturation threshold.
+ threshold: float. Threshold value for thresholded activation.
Returns:
- The (leaky) rectified linear unit activation: `x` if `x > 0`,
- `alpha * x` if `x < 0`. If `max_value` is defined, the result
- is truncated to this value.
+ A tensor.
"""
- return K.relu(x, alpha=alpha, max_value=max_value)
+ return K.relu(x, alpha=alpha, max_value=max_value, threshold=threshold)
@tf_export('keras.activations.tanh')
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 333f927d2f..38794f1612 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -3372,26 +3372,48 @@ def in_test_phase(x, alt, training=None):
@tf_export('keras.backend.relu')
-def relu(x, alpha=0., max_value=None):
+def relu(x, alpha=0., max_value=None, threshold=0):
"""Rectified linear unit.
With default values, it returns element-wise `max(x, 0)`.
+ Otherwise, it follows:
+ `f(x) = max_value` for `x >= max_value`,
+ `f(x) = x` for `threshold <= x < max_value`,
+ `f(x) = alpha * (x - threshold)` otherwise.
+
Arguments:
x: A tensor or variable.
alpha: A scalar, slope of negative section (default=`0.`).
- max_value: Saturation threshold.
+ max_value: float. Saturation threshold.
+ threshold: float. Threshold value for thresholded activation.
Returns:
A tensor.
"""
+ clip_max = max_value is not None
+
if alpha != 0.:
- negative_part = nn.relu(-x)
- x = nn.relu(x)
- if max_value is not None:
+ if threshold != 0:
+ negative_part = nn.relu(-x + threshold)
+ else:
+ negative_part = nn.relu(-x)
+
+ if threshold != 0:
+ # computes x for x > threshold else 0
+ x = x * math_ops.cast(math_ops.greater(x, threshold), floatx())
+ elif max_value == 6:
+      # With no threshold, we can use the native nn.relu6 op for performance.
+ x = nn.relu6(x)
+ clip_max = False
+ else:
+ x = nn.relu(x)
+
+ if clip_max:
max_value = _to_tensor(max_value, x.dtype.base_dtype)
zero = _to_tensor(0., x.dtype.base_dtype)
x = clip_ops.clip_by_value(x, zero, max_value)
+
if alpha != 0.:
alpha = _to_tensor(alpha, x.dtype.base_dtype)
x -= alpha * negative_part
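
As a reference for the piecewise definition above, a NumPy sketch of the value
`relu` now computes (illustrative only, not the TF implementation; note that
x == threshold maps to 0, matching the strict comparison in the code):

    import numpy as np

    def relu_reference(x, alpha=0., max_value=None, threshold=0.):
      x = np.asarray(x, dtype=np.float64)
      # alpha * (x - threshold) below the threshold, identity above it.
      out = np.where(x > threshold, x, alpha * (x - threshold))
      if max_value is not None:
        # Saturate the positive side at max_value.
        out = np.minimum(out, max_value)
      return out

    print(relu_reference([[-4., 0.], [2., 7.]],
                         alpha=0.25, threshold=4, max_value=5))
    # [[-2.  -1. ]
    #  [-0.5  5. ]]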
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 36478ea089..40e7910061 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -23,6 +23,7 @@ import scipy.sparse
from tensorflow.python import keras
from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor
from tensorflow.python.ops import variables
from tensorflow.python.platform import test
@@ -490,6 +491,66 @@ class BackendLinearAlgebraTest(test.TestCase):
input_shape_a=(4, 7),
input_shape_b=(4, 7))
+ def test_relu(self):
+ x = ops.convert_to_tensor([[-4, 0], [2, 7]], 'float32')
+ with self.test_session():
+ # standard relu
+ relu_op = keras.backend.relu(x)
+ self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+ # alpha
+ relu_op = keras.backend.relu(x, alpha=0.5)
+ self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
+
+ # max_value < some elements
+ relu_op = keras.backend.relu(x, max_value=5)
+ self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 5]])
+
+ # nn.relu6 used
+ relu_op = keras.backend.relu(x, max_value=6)
+ self.assertTrue('Relu6' in relu_op.name) # uses tf.nn.relu6
+ self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
+
+ # max value > 6
+ relu_op = keras.backend.relu(x, max_value=10)
+ self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+ # max value is float
+ relu_op = keras.backend.relu(x, max_value=4.3)
+ self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 4.3]])
+
+ # max value == 0
+ relu_op = keras.backend.relu(x, max_value=0)
+ self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 0]])
+
+ # alpha and max_value
+ relu_op = keras.backend.relu(x, alpha=0.25, max_value=3)
+ self.assertAllClose(keras.backend.eval(relu_op), [[-1, 0], [2, 3]])
+
+ # threshold
+ relu_op = keras.backend.relu(x, threshold=3)
+ self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 7]])
+
+ # threshold is float
+ relu_op = keras.backend.relu(x, threshold=1.5)
+ self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+ # threshold is negative
+ relu_op = keras.backend.relu(x, threshold=-5)
+ self.assertAllClose(keras.backend.eval(relu_op), [[-4, 0], [2, 7]])
+
+ # threshold and max_value
+ relu_op = keras.backend.relu(x, threshold=3, max_value=5)
+ self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 5]])
+
+ # threshold and alpha
+ relu_op = keras.backend.relu(x, alpha=0.25, threshold=4)
+ self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 7]])
+
+ # threshold, alpha, and max_value
+ relu_op = keras.backend.relu(x, alpha=0.25, threshold=4, max_value=5)
+ self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
+
class BackendShapeOpsTest(test.TestCase):
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 0857a3279f..070d41147d 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -31,12 +31,14 @@ import time
import numpy as np
import six
+from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.engine.training_utils import standardize_input_data
from tensorflow.python.keras.utils.generic_utils import Progbar
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2
from tensorflow.python.ops import variables
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.summary import summary as tf_summary
@@ -716,6 +718,15 @@ class TensorBoard(Callback):
`embeddings_layer_names`. Numpy array (if the model has a single
input) or list of Numpy arrays (if the model has multiple inputs).
Learn [more about embeddings](https://www.tensorflow.org/programmers_guide/embedding)
+
+ Raises:
+ ValueError: If histogram_freq is set and no validation data is provided.
+
+  @compatibility(eager)
+  Using the `TensorBoard` callback will work while eager execution is
+  enabled, but outputting histogram summaries of weights and gradients is
+  not supported, and thus `histogram_freq` will be ignored.
+ @end_compatibility
"""
# pylint: enable=line-too-long
@@ -734,24 +745,34 @@ class TensorBoard(Callback):
super(TensorBoard, self).__init__()
self.log_dir = log_dir
self.histogram_freq = histogram_freq
+ if self.histogram_freq and context.executing_eagerly():
+ logging.warning(
+          UserWarning('Weight and gradient histograms not supported for eager '
+                      'execution, setting `histogram_freq` to `0`.'))
+ self.histogram_freq = 0
self.merged = None
self.write_graph = write_graph
self.write_grads = write_grads
self.write_images = write_images
self.batch_size = batch_size
self._current_batch = 0
- # abstracted writer class to be able to stub for testing
- self._writer_class = tf_summary.FileWriter
+ self._total_batches_seen = 0
self.embeddings_freq = embeddings_freq
self.embeddings_layer_names = embeddings_layer_names
self.embeddings_metadata = embeddings_metadata
self.embeddings_data = embeddings_data
- def set_model(self, model):
- """Sets Keras model and creates summary ops."""
+ def _init_writer(self):
+ """Sets file writer."""
+ if context.executing_eagerly():
+ self.writer = summary_ops_v2.create_file_writer(self.log_dir)
+ elif self.write_graph:
+ self.writer = tf_summary.FileWriter(self.log_dir, K.get_session().graph)
+ else:
+ self.writer = tf_summary.FileWriter(self.log_dir)
- self.model = model
- self.sess = K.get_session()
+ def _make_histogram_ops(self, model):
+ """Defines histogram ops when histogram_freq > 0."""
# only make histogram summary op if it hasn't already been made
if self.histogram_freq and self.merged is None:
for layer in self.model.layers:
@@ -792,8 +813,10 @@ class TensorBoard(Callback):
def is_indexed_slices(grad):
return type(grad).__name__ == 'IndexedSlices'
- grads = [grad.values if is_indexed_slices(grad) else grad
- for grad in grads]
+ grads = [
+ grad.values if is_indexed_slices(grad) else grad
+ for grad in grads
+ ]
tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
if hasattr(layer, 'output'):
@@ -802,12 +825,16 @@ class TensorBoard(Callback):
tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
else:
tf_summary.histogram('{}_out'.format(layer.name), layer.output)
- self.merged = tf_summary.merge_all()
- if self.write_graph:
- self.writer = self._writer_class(self.log_dir, self.sess.graph)
- else:
- self.writer = self._writer_class(self.log_dir)
+ def set_model(self, model):
+ """Sets Keras model and creates summary ops."""
+
+ self.model = model
+ self._init_writer()
+ # histogram summaries only enabled in graph mode
+ if not context.executing_eagerly():
+ self._make_histogram_ops(model)
+ self.merged = tf_summary.merge_all()
# If both embedding_freq and embeddings_data are available, we will
# visualize embeddings.
@@ -883,9 +910,34 @@ class TensorBoard(Callback):
self._epoch + self._current_val_batch / self._validation_batches)
self._current_val_batch += 1
+ def _write_custom_summaries(self, step, logs=None):
+ """Writes metrics out as custom scalar summaries.
+
+ Arguments:
+      step: the global step to use for TensorBoard.
+ logs: dict. Keys are scalar summary names, values are
+ NumPy scalars.
+
+ """
+ logs = logs or {}
+ if context.executing_eagerly():
+ # use v2 summary ops
+ with self.writer.as_default(), summary_ops_v2.always_record_summaries():
+ for name, value in logs.items():
+ summary_ops_v2.scalar(name, value.item(), step=step)
+ else:
+ # use FileWriter from v1 summary
+ for name, value in logs.items():
+ summary = tf_summary.Summary()
+ summary_value = summary.value.add()
+ summary_value.simple_value = value.item()
+ summary_value.tag = name
+ self.writer.add_summary(summary, step)
+ self.writer.flush()
+
def on_train_begin(self, logs=None):
"""Checks if histogram summaries can be run."""
-
+    # histogram_freq will never be set when executing eagerly.
if self.histogram_freq:
if 'validation_steps' in self.params:
self._validation_batches = self.params['validation_steps']
@@ -899,6 +951,16 @@ class TensorBoard(Callback):
raise ValueError(
'If printing histograms, validation data must have length > 0.')
+ def on_batch_end(self, batch, logs=None):
+ """Writes scalar summaries for metrics on every training batch."""
+    # Don't output batch_size and batch number as TensorBoard summaries.
+ logs = logs or {}
+ batch_logs = {('batch_' + k): v
+ for k, v in logs.items()
+ if k not in ['batch', 'size']}
+ self._write_custom_summaries(self._total_batches_seen, batch_logs)
+ self._total_batches_seen += 1
+
def on_epoch_begin(self, epoch, logs=None):
"""Add histogram op to Model test_function callbacks, reset batch count."""
@@ -915,7 +977,12 @@ class TensorBoard(Callback):
def on_epoch_end(self, epoch, logs=None):
"""Checks if summary ops should run next epoch, logs scalar summaries."""
- logs = logs or {}
+    # Don't output batch_size and batch number as TensorBoard summaries.
+ logs = {('epoch_' + k): v
+ for k, v in logs.items()
+ if k not in ['batch', 'size']}
+ self._write_custom_summaries(epoch, logs)
# pop the histogram summary op after each epoch
if self.histogram_freq:
@@ -964,16 +1031,6 @@ class TensorBoard(Callback):
i += self.batch_size
- for name, value in logs.items():
- if name in ['batch', 'size']:
- continue
- summary = tf_summary.Summary()
- summary_value = summary.value.add()
- summary_value.simple_value = value.item()
- summary_value.tag = name
- self.writer.add_summary(summary, epoch)
- self.writer.flush()
-
def on_train_end(self, logs=None):
self.writer.close()
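
A hedged sketch of the graph-mode half of `_write_custom_summaries`: each
scalar metric becomes a one-value v1 `Summary` proto written at the given
global step (the log directory is illustrative):

    import numpy as np
    from tensorflow.python.summary import summary as tf_summary

    writer = tf_summary.FileWriter("/tmp/tb_demo")
    logs = {"batch_acc": np.float32(0.75)}
    step = 3  # e.g. self._total_batches_seen

    for name, value in logs.items():
      summary = tf_summary.Summary()
      summary_value = summary.value.add()
      summary_value.simple_value = value.item()  # NumPy scalar -> float
      summary_value.tag = name
      writer.add_summary(summary, step)
    writer.flush()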
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 45598cafd3..ffec197028 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -22,6 +22,7 @@ import csv
import os
import re
import shutil
+import tempfile
import threading
import unittest
@@ -29,10 +30,12 @@ import numpy as np
from tensorflow.core.framework import summary_pb2
from tensorflow.python import keras
+from tensorflow.python.framework import test_util
from tensorflow.python.keras import testing_utils
from tensorflow.python.platform import test
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import adam
try:
import h5py # pylint:disable=g-import-not-at-top
@@ -917,6 +920,9 @@ class KerasCallbacksTest(test.TestCase):
def close(self):
pass
+ def _init_writer(obj):
+ obj.writer = FileWriterStub(obj.log_dir)
+
np.random.seed(1337)
tmpdir = self.get_temp_dir()
self.addCleanup(shutil.rmtree, tmpdir)
@@ -940,13 +946,13 @@ class KerasCallbacksTest(test.TestCase):
loss='categorical_crossentropy',
optimizer='sgd',
metrics=['accuracy'])
+ keras.callbacks.TensorBoard._init_writer = _init_writer
tsb = keras.callbacks.TensorBoard(
log_dir=tmpdir,
histogram_freq=1,
write_images=True,
write_grads=True,
batch_size=5)
- tsb._writer_class = FileWriterStub
cbks = [tsb]
# fit with validation data
@@ -1096,6 +1102,112 @@ class KerasCallbacksTest(test.TestCase):
assert os.path.exists(temp_dir)
+ def test_Tensorboard_batch_logging(self):
+
+ class FileWriterStub(object):
+
+ def __init__(self, logdir, graph=None):
+ self.logdir = logdir
+ self.graph = graph
+ self.batches_logged = []
+ self.summary_values = []
+ self.summary_tags = []
+
+ def add_summary(self, summary, step):
+ self.summary_values.append(summary.value[0].simple_value)
+ self.summary_tags.append(summary.value[0].tag)
+ self.batches_logged.append(step)
+
+ def flush(self):
+ pass
+
+ def close(self):
+ pass
+
+ temp_dir = self.get_temp_dir()
+ self.addCleanup(shutil.rmtree, temp_dir)
+
+ tb_cbk = keras.callbacks.TensorBoard(temp_dir)
+ tb_cbk.writer = FileWriterStub(temp_dir)
+
+ for batch in range(5):
+ tb_cbk.on_batch_end(batch, {'acc': np.float32(batch)})
+ self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
+ self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
+ self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
+
+ def test_Tensorboard_epoch_and_batch_logging(self):
+
+ class FileWriterStub(object):
+
+ def __init__(self, logdir, graph=None):
+ self.logdir = logdir
+ self.graph = graph
+
+ def add_summary(self, summary, step):
+ if 'batch_' in summary.value[0].tag:
+ self.batch_summary = (step, summary)
+ elif 'epoch_' in summary.value[0].tag:
+ self.epoch_summary = (step, summary)
+
+ def flush(self):
+ pass
+
+ def close(self):
+ pass
+
+ temp_dir = self.get_temp_dir()
+ self.addCleanup(shutil.rmtree, temp_dir)
+
+ tb_cbk = keras.callbacks.TensorBoard(temp_dir)
+ tb_cbk.writer = FileWriterStub(temp_dir)
+
+ tb_cbk.on_batch_end(0, {'acc': np.float32(5.0)})
+ tb_cbk.on_epoch_end(0, {'acc': np.float32(10.0)})
+ batch_step, batch_summary = tb_cbk.writer.batch_summary
+ self.assertEqual(batch_step, 0)
+ self.assertEqual(batch_summary.value[0].simple_value, 5.0)
+ epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
+ self.assertEqual(epoch_step, 0)
+ self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_Tensorboard_eager(self):
+ with self.test_session():
+ temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+ self.addCleanup(shutil.rmtree, temp_dir)
+
+ (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+ train_samples=TRAIN_SAMPLES,
+ test_samples=TEST_SAMPLES,
+ input_shape=(INPUT_DIM,),
+ num_classes=NUM_CLASSES)
+ y_test = keras.utils.to_categorical(y_test)
+ y_train = keras.utils.to_categorical(y_train)
+
+ model = keras.models.Sequential()
+ model.add(
+ keras.layers.Dense(
+ NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+ model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+ model.compile(
+ loss='binary_crossentropy',
+ optimizer=adam.AdamOptimizer(0.01),
+ metrics=['accuracy'])
+
+ cbks = [keras.callbacks.TensorBoard(log_dir=temp_dir)]
+
+ model.fit(
+ x_train,
+ y_train,
+ batch_size=BATCH_SIZE,
+ validation_data=(x_test, y_test),
+ callbacks=cbks,
+ epochs=2,
+ verbose=0)
+
+ self.assertTrue(os.path.exists(temp_dir))
+
def test_RemoteMonitorWithJsonPayload(self):
if requests is None:
self.skipTest('`requests` required to run this test')
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index e02792208b..c7ab6750cb 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -26,6 +26,7 @@ import numpy as np
from six.moves import zip # pylint: disable=redefined-builtin
from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
@@ -174,6 +175,12 @@ class Layer(checkpointable.CheckpointableBase):
self.supports_masking = False
+ call_argspec = tf_inspect.getargspec(self.call)
+ if 'training' in call_argspec.args:
+ self._expects_training_arg = True
+ else:
+ self._expects_training_arg = False
+
# Manage input shape information if passed.
if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
# In this case we will later create an input layer
@@ -723,9 +730,17 @@ class Layer(checkpointable.CheckpointableBase):
self._dtype = input_list[0].dtype.base_dtype.name
except AttributeError:
pass
+
if all(hasattr(x, 'shape') for x in input_list):
input_shapes = nest.map_structure(lambda x: x.shape, inputs)
- self.build(input_shapes)
+
+ if (not hasattr(self, '_is_graph_network') or
+ self.__class__.__name__ == 'Sequential'):
+ # Only if self is a layer or an instance of a sequential model do we
+ # need to build it.
+ self.build(input_shapes)
+ # We must set self.built since user defined build functions are not
+ # constrained to set self.built.
self.built = True
# Check input assumptions set after layer building, e.g. input shape.
@@ -778,17 +793,8 @@ class Layer(checkpointable.CheckpointableBase):
if hasattr(self, '_initial_weights') and self._initial_weights is not None:
self.set_weights(self._initial_weights)
del self._initial_weights
- self._post_build_cleanup()
return outputs
- def _post_build_cleanup(self):
- """Hooks to run after all sub-Layers are built."""
- # Note that in addition to Layer.__call__, this method is called by Model
- # after building a graph network (which skips __call__). It should be called
- # when possible if self.built may have switched from False to True, and is
- # idempotent.
- pass # No-op for Layers which don't override this method.
-
def apply(self, inputs, *args, **kwargs):
"""Apply the layer on a input.
@@ -962,6 +968,39 @@ class Layer(checkpointable.CheckpointableBase):
Returns:
An input shape tuple.
"""
+ if context.executing_eagerly():
+ # In this case we build the model first in order to do shape inference.
+ # This is acceptable because the framework only calls
+ # `compute_output_shape` on shape values that the layer would later be
+ # built for. It would however cause issues in case a user attempts to
+ # use `compute_output_shape` manually (these users will have to
+ # implement `compute_output_shape` themselves).
+ self.build(input_shape)
+
+ with context.graph_mode():
+ graph = eager_function.CapturingGraph({})
+ with graph.as_default():
+ if isinstance(input_shape, list):
+ inputs = [generate_placeholders_from_shape(shape)
+ for shape in input_shape]
+ else:
+ inputs = generate_placeholders_from_shape(input_shape)
+
+ try:
+ if self._expects_training_arg:
+ outputs = self(inputs, training=False)
+ else:
+ outputs = self(inputs)
+ except TypeError:
+ raise NotImplementedError('We could not automatically infer '
+ 'the static shape of the layer\'s output.'
+ ' Please implement the '
+ '`compute_output_shape` method on your '
+ 'layer (%s).' % self.__class__.__name__)
+ if isinstance(outputs, list):
+ return [output.shape for output in outputs]
+ else:
+ return outputs.shape
raise NotImplementedError
def compute_mask(self, inputs, mask=None): # pylint: disable=unused-argument
@@ -1917,3 +1956,18 @@ def make_variable(name,
synchronization=synchronization,
aggregation=aggregation)
return v
+
+
+def generate_dummy_data_from_shape(shape):
+ if isinstance(shape, tensor_shape.TensorShape):
+ shape = shape.as_list()
+
+ # Replace Nones in input shape with dummy `1` value
+ shape = [x.value if isinstance(x, tensor_shape.Dimension) else x
+ for x in shape]
+ shape = [1 if x is None else x for x in shape]
+ return array_ops.ones(shape, dtype=backend.floatx())
+
+
+def generate_placeholders_from_shape(shape):
+ return array_ops.placeholder(shape=shape, dtype=backend.floatx())
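
To illustrate the default eager-mode shape inference added above, a hedged
sketch of a custom layer that never overrides `compute_output_shape`: the base
class builds it, traces a placeholder through `call()` inside a temporary
graph, and reports the resulting static shape (TF 1.x style):

    import tensorflow as tf
    tf.enable_eager_execution()

    class Doubler(tf.keras.layers.Layer):

      def call(self, inputs):
        # Output has twice as many features as the input.
        return tf.concat([inputs, inputs], axis=-1)

    layer = Doubler()
    print(layer.compute_output_shape((None, 3)))  # (None, 6)
    print(layer.built)  # True: inference built the layer as a side effect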
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index a4d96de74f..d9b031d080 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -20,7 +20,6 @@ from __future__ import division
from __future__ import print_function
import copy
-import functools
import json
import os
import weakref
@@ -30,6 +29,7 @@ from six.moves import zip # pylint: disable=redefined-builtin
from tensorflow.python import pywrap_tensorflow
from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
from tensorflow.python.framework import errors_impl
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
@@ -144,10 +144,6 @@ class Network(base_layer.Layer):
self._checkpointable_saver = checkpointable_utils.CheckpointableSaver(
weakref.ref(self))
- # A zero-argument function which should be called and set back to None as
- # soon as the network is built (only applicable to subclassed Models). Runs
- # restore operations when graph building.
- self._in_progress_restore_finalizer = None
@checkpointable.no_automatic_dependency_tracking
def _init_graph_network(self, inputs, outputs, name=None):
@@ -318,8 +314,8 @@ class Network(base_layer.Layer):
else:
self._expects_training_arg = False
self._call_convention = self._determine_call_convention(call_argspec)
- self.outputs = None
- self.inputs = None
+ self.outputs = []
+ self.inputs = []
self.built = False
def _determine_call_convention(self, call_argspec):
@@ -739,6 +735,86 @@ class Network(base_layer.Layer):
return specs[0]
return specs
+ def build(self, input_shape):
+ """Builds the model based on input shapes received.
+
+ This is to be used for subclassed models, which do not know at instantiation
+ time what their inputs look like.
+
+ Args:
+ input_shape: Single tuple, TensorShape, or list of shapes, where shapes
+ are tuples, integers, or TensorShapes.
+
+ Raises:
+ ValueError:
+ 1. In case of invalid user-provided data (not of type tuple,
+ list, or TensorShape).
+ 2. If the model requires call arguments that are agnostic
+ to the input shapes (positional or kwarg in call signature).
+ 3. If not all layers were properly built.
+ 4. If float type inputs are not supported within the layers.
+
+ In each of these cases, the user should build their model by calling it
+ on real tensor data.
+ """
+ if self._is_graph_network:
+ self.built = True
+ return
+
+ # If subclass network
+ if input_shape is None:
+ raise ValueError('Input shape must be defined when calling build on a '
+ 'model subclass network.')
+ valid_types = (tuple, list, tensor_shape.TensorShape)
+ if not isinstance(input_shape, valid_types):
+ raise ValueError('Specified input shape is not one of the valid types. '
+ 'Please specify a batch input shape of type tuple or '
+ 'list of input shapes. User provided '
+ 'input type: {}'.format(type(input_shape)))
+
+ if input_shape and not self.inputs:
+ if isinstance(input_shape, list):
+ # List of input shapes
+ x = [base_layer.generate_dummy_data_from_shape(shape)
+ for shape in input_shape]
+ else:
+ x = base_layer.generate_dummy_data_from_shape(input_shape)
+
+ kwargs = {}
+ num_call_args = len(tf_inspect.getargspec(self.call).args)
+ if self._expects_training_arg and num_call_args == 3:
+ # Has call signature of call(self, input, training)
+ kwargs['training'] = False
+ elif num_call_args > 2:
+ # Has invalid call signature of call(self, input, *args, **kwargs)
+ raise ValueError('Currently, you cannot build your model if it has '
+ 'positional or keyword arguments that are not '
+ 'inputs to the model, but are required for its '
+ '`call` method. Instead, in order to instantiate '
+ 'and build your model, `call` your model on real '
+ 'tensor data with all expected call arguments.')
+
+ try:
+ self.call(x, **kwargs)
+ except (errors.InvalidArgumentError, TypeError):
+ raise ValueError('You cannot build your model by calling `build` '
+ 'if your layers do not support float type inputs. '
+ 'Instead, in order to instantiate and build your '
+ 'model, `call` your model on real tensor data (of '
+ 'the correct dtype).')
+
+ if self._layers:
+ self._track_layers(self._layers)
+ if self.layers:
+ for layer in self.layers:
+ if not layer.built:
+ raise ValueError('Layer: {} was not built in your model. Calling '
+ '`build` manually on a subclassed model is only '
+ 'allowed for models with a static topology. '
+ 'In this case, you can build your model by '
+ 'calling it on real tensor data.'.format(layer))
+ self.built = True
+
def call(self, inputs, training=None, mask=None):
"""Calls the model on new inputs.
@@ -779,6 +855,8 @@ class Network(base_layer.Layer):
def compute_output_shape(self, input_shape):
if not self._is_graph_network:
+ if context.executing_eagerly():
+ return super(Network, self).compute_output_shape(input_shape)
raise NotImplementedError
if isinstance(input_shape, list):
@@ -1423,13 +1501,9 @@ class Network(base_layer.Layer):
'load_weights).')
if not context.executing_eagerly():
session = backend.get_session()
- finalizer = functools.partial(status.run_restore_ops, session=session)
- if self.built:
- finalizer()
- else:
- # Hold on to this status object until the network is built (for
- # subclassed Models). Then we'll run restore ops if necessary.
- self._in_progress_restore_finalizer = finalizer
+ # Restore existing variables (if any) immediately, and set up a
+ # streaming restore for any variables created in the future.
+ checkpointable_utils.streaming_restore(status=status, session=session)
return status
if h5py is None:
raise ImportError(
@@ -1447,14 +1521,6 @@ class Network(base_layer.Layer):
else:
saving.load_weights_from_hdf5_group(f, self.layers)
- def _post_build_cleanup(self):
- super(Network, self)._post_build_cleanup()
- if self._in_progress_restore_finalizer is not None:
- # Runs queued restore operations left over from load_weights when graph
- # building.
- self._in_progress_restore_finalizer()
- self._in_progress_restore_finalizer = None
-
def _updated_config(self):
"""Util shared between different serialization methods.
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index 030328f2a6..e029e614e0 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -722,18 +722,23 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
self.assertEqual(len(graph.get_operations()), op_count)
def _weight_loading_test_template(self, make_model_fn):
- with self.test_session() as session:
+ with self.test_session():
model = make_model_fn()
+ model.compile(
+ loss='mse',
+ optimizer=training_module.RMSPropOptimizer(0.1),
+ metrics=['acc'])
temp_dir = self.get_temp_dir()
prefix = os.path.join(temp_dir, 'ckpt')
+ train_x = np.random.random((3, 2))
+ train_y = np.random.random((3,))
+ x = constant_op.constant(train_x, dtype=dtypes.float32)
- x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
- executing_eagerly = context.executing_eagerly()
- ref_y_tensor = model(x)
- if not executing_eagerly:
- session.run([v.initializer for v in model.variables])
- ref_y = self.evaluate(ref_y_tensor)
+ model.train_on_batch(train_x, train_y)
model.save_weights(prefix, save_format='tf')
+ ref_y_before_train = model.predict(train_x)
+ model.train_on_batch(train_x, train_y)
+ ref_y_after_train = model.predict(train_x)
for v in model.variables:
self.evaluate(
v.assign(random_ops.random_normal(shape=array_ops.shape(v))))
@@ -741,16 +746,27 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
self.addCleanup(shutil.rmtree, temp_dir)
model.load_weights(prefix)
- y = self.evaluate(model(x))
- self.assertAllClose(ref_y, y)
+ self.assertAllClose(ref_y_before_train, self.evaluate(model(x)))
# Test restore-on-create if this is a subclassed Model (graph Networks
# will have already created their variables).
load_model = make_model_fn()
load_model.load_weights(prefix)
- restore_on_create_y_tensor = load_model(x)
- restore_on_create_y = self.evaluate(restore_on_create_y_tensor)
- self.assertAllClose(ref_y, restore_on_create_y)
+ self.assertAllClose(
+ ref_y_before_train,
+ self.evaluate(load_model(x)))
+ load_model = make_model_fn()
+ load_model.load_weights(prefix)
+ # We need to run some of the restore ops for predict(), but not all
+ # variables have been created yet (optimizer slot variables). Tests
+ # incremental restore.
+ load_model.predict(train_x)
+ load_model.compile(
+ loss='mse',
+ optimizer=training_module.RMSPropOptimizer(0.1),
+ metrics=['acc'])
+ load_model.train_on_batch(train_x, train_y)
+ self.assertAllClose(ref_y_after_train, self.evaluate(load_model(x)))
@test_util.run_in_graph_and_eager_modes
def test_weight_loading_graph_model(self):
@@ -858,5 +874,6 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
SubclassedModel, SubclassedModelRestore,
_restore_init_fn)
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 371504a503..41cdfda660 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -213,13 +213,31 @@ class Sequential(Model):
self.outputs = [self.layers[-1].output]
self.build()
- @checkpointable.no_automatic_dependency_tracking
def build(self, input_shape=None):
- if input_shape and not self.inputs:
- batch_shape = tuple(input_shape)
+ self._set_inputs_and_outputs(input_shape=input_shape)
+
+ def symbolic_set_inputs(self, inputs):
+ self._set_inputs_and_outputs(tensor=inputs)
+
+ @checkpointable.no_automatic_dependency_tracking
+ def _set_inputs_and_outputs(self, input_shape=None, tensor=None):
+ """Set model's input and output specs based on the input received.
+
+ If `tensor` is provided, `input_shape` is not required.
+
+ Args:
+ input_shape: Optional shape of input.
+ tensor: Optional existing tensor to wrap into the `Input` layer.
+ """
+ if not self.inputs:
dtype = K.floatx()
- x = Input(
- batch_shape=batch_shape, dtype=dtype, name=self.name + '_input')
+ if tensor is not None:
+ batch_shape = (None,) + tuple(tensor.get_shape().as_list()[1:])
+ x = Input(dtype=dtype, name=self.name + '_input', tensor=tensor)
+ elif input_shape is not None:
+ batch_shape = tuple(input_shape)
+ x = Input(
+ batch_shape=batch_shape, dtype=dtype, name=self.name + '_input')
self.inputs = [x]
for layer in self._layers:
x = layer(x)
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 0f54e29cee..4f4adca333 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -22,7 +22,6 @@ import numpy as np
from tensorflow.python import keras
from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
from tensorflow.python.framework import test_util as tf_test_util
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import test
@@ -104,9 +103,6 @@ class TestSequential(test.TestCase):
@tf_test_util.run_in_graph_and_eager_modes
def test_sequential_deferred_build_with_dataset_iterators(self):
- if not context.executing_eagerly():
- # TODO(psv/fchollet): Add support for this use case in graph mode.
- return
num_hidden = 5
input_dim = 3
num_classes = 2
@@ -136,6 +132,48 @@ class TestSequential(test.TestCase):
[None, num_classes])
self.assertEqual(len(model.weights), 2 * 2)
+ def test_training_and_eval_methods_on_symbolic_tensors(self):
+ with self.test_session():
+
+ def create_model():
+ model = keras.Sequential()
+ model.add(keras.layers.Dense(10, activation='relu'))
+ model.add(keras.layers.Dense(4, activation='softmax'))
+
+ model.compile(
+ optimizer=rmsprop.RMSPropOptimizer(1e-3),
+ loss='categorical_crossentropy',
+ metrics=['accuracy'])
+ return model
+
+ inputs = keras.backend.zeros(shape=(10, 3))
+ targets = keras.backend.zeros(shape=(10, 4))
+
+ model = create_model()
+ model.fit(inputs, targets, epochs=10, steps_per_epoch=30)
+
+ model = create_model()
+ model.evaluate(inputs, targets, steps=2, verbose=0)
+
+ model = create_model()
+ model.predict(inputs, steps=2)
+
+ model = create_model()
+ model.train_on_batch(inputs, targets)
+
+ model = create_model()
+ model.test_on_batch(inputs, targets)
+
+ model = create_model()
+ model.fit(
+ inputs,
+ targets,
+ epochs=1,
+ steps_per_epoch=2,
+ verbose=0,
+ validation_data=(inputs, targets),
+ validation_steps=2)
+
@tf_test_util.run_in_graph_and_eager_modes
def test_invalid_use_cases(self):
# Added objects must be layer instances
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index 3eb69bd7f3..34f74db6ef 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -110,7 +110,6 @@ class TopologyConstructionTest(test.TestCase):
layer = keras.layers.BatchNormalization()
_ = layer.apply(x1)
- print('BN updates', layer._updates)
self.assertEqual(len(layer.updates), 2)
self.assertEqual(len(layer.get_updates_for(x1)), 2)
self.assertEqual(len(layer.get_updates_for(None)), 0)
@@ -960,9 +959,6 @@ class DeferredModeTest(test.TestCase):
def call(self, inputs):
return inputs[0] + inputs[1]
- def compute_output_shape(self, input_shape):
- return input_shape[0]
-
c = AddLayer()([a, input_b]) # pylint: disable=not-callable
c = keras.layers.Dense(2)(c)
@@ -978,6 +974,101 @@ class DeferredModeTest(test.TestCase):
self.assertEqual(outputs[1].shape.as_list(), [10, 2])
+class DefaultShapeInferenceBehaviorTest(test.TestCase):
+
+ def _testShapeInference(self, model, input_shape, expected_output_shape):
+ input_value = np.random.random(input_shape)
+ output_value = model.predict(input_value)
+ self.assertEqual(output_value.shape, expected_output_shape)
+
+ @test_util.run_in_graph_and_eager_modes()
+ def testSingleInputCase(self):
+
+ class LayerWithOneInput(keras.layers.Layer):
+
+ def build(self, input_shape):
+ self.w = array_ops.ones(shape=(3, 4))
+
+ def call(self, inputs):
+ return keras.backend.dot(inputs, self.w)
+
+ inputs = input_layer_lib.Input(shape=(3,))
+ layer = LayerWithOneInput()
+
+ if context.executing_eagerly():
+ self.assertEqual(
+ layer.compute_output_shape((None, 3)).as_list(), [None, 4])
+ # As a side-effect, compute_output_shape builds the layer.
+ self.assertTrue(layer.built)
+ # We can still query the layer's compute_output_shape with compatible
+ # input shapes.
+ self.assertEqual(
+ layer.compute_output_shape((6, 3)).as_list(), [6, 4])
+
+ outputs = layer(inputs)
+ model = keras.Model(inputs, outputs)
+ self._testShapeInference(model, (2, 3), (2, 4))
+
+ @test_util.run_in_graph_and_eager_modes()
+ def testMultiInputOutputCase(self):
+
+ class MultiInputOutputLayer(keras.layers.Layer):
+
+ def build(self, input_shape):
+ self.w = array_ops.ones(shape=(3, 4))
+
+ def call(self, inputs):
+ a = keras.backend.dot(inputs[0], self.w)
+ b = a + inputs[1]
+ return [a, b]
+
+ input_a = input_layer_lib.Input(shape=(3,))
+ input_b = input_layer_lib.Input(shape=(4,))
+ output_a, output_b = MultiInputOutputLayer()([input_a, input_b])
+ model = keras.Model([input_a, input_b], [output_a, output_b])
+ output_a_val, output_b_val = model.predict(
+ [np.random.random((2, 3)), np.random.random((2, 4))])
+ self.assertEqual(output_a_val.shape, (2, 4))
+ self.assertEqual(output_b_val.shape, (2, 4))
+
+ @test_util.run_in_graph_and_eager_modes()
+ def testTrainingArgument(self):
+
+ class LayerWithTrainingArg(keras.layers.Layer):
+
+ def build(self, input_shape):
+ self.w = array_ops.ones(shape=(3, 4))
+
+ def call(self, inputs, training):
+ return keras.backend.dot(inputs, self.w)
+
+ inputs = input_layer_lib.Input(shape=(3,))
+ outputs = LayerWithTrainingArg()(inputs, training=False)
+ model = keras.Model(inputs, outputs)
+ self._testShapeInference(model, (2, 3), (2, 4))
+
+ @test_util.run_in_graph_and_eager_modes()
+ def testUnsupportedSignature(self):
+
+ class LayerWithAdditionalArg(keras.layers.Layer):
+
+ def build(self, input_shape):
+ self.w = array_ops.ones(shape=(3, 4))
+
+ def call(self, inputs, some_arg):
+ return keras.backend.dot(inputs, self.w) + some_arg
+
+ inputs = input_layer_lib.Input(shape=(3,))
+ if context.executing_eagerly():
+ with self.assertRaises(NotImplementedError):
+ outputs = LayerWithAdditionalArg()(inputs, some_arg=0)
+ else:
+ # Works with graph mode because the graph of ops is built together with
+ # the graph of layers.
+ outputs = LayerWithAdditionalArg()(inputs, some_arg=0)
+ _ = keras.Model(inputs, outputs)
+
+
class GraphUtilsTest(test.TestCase):
def testGetReachableFromInputs(self):
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index fbc2a11eda..0fe14e99e0 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -24,13 +24,11 @@ import numpy as np
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.ops import iterator_ops
from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
from tensorflow.python.framework import errors
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_util
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import losses
-from tensorflow.python.keras import metrics as metrics_module
from tensorflow.python.keras import optimizers
from tensorflow.python.keras.engine import base_layer
from tensorflow.python.keras.engine import training_arrays
@@ -39,7 +37,6 @@ from tensorflow.python.keras.engine import training_generator
from tensorflow.python.keras.engine import training_utils
from tensorflow.python.keras.engine.network import Network
from tensorflow.python.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.ops import array_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import optimizer as tf_optimizer_module
from tensorflow.python.training.checkpointable import base as checkpointable
@@ -372,21 +369,14 @@ class Model(Network):
'sample_weight_mode dictionary: "' + name + '". '
'Only expected the following keys: ' + str(self.output_names))
for i, name in enumerate(self.output_names):
- if i in skip_target_weighing_indices:
- weight = None
- sample_weight_modes.append(None)
- else:
- if name not in sample_weight_mode:
- raise ValueError(
- 'Output "' + name + '" missing from sample_weight_modes '
- 'dictionary')
- if sample_weight_mode.get(name) == 'temporal':
- weight = K.placeholder(ndim=2, name=name + '_sample_weights')
- sample_weight_modes.append('temporal')
- else:
- weight = K.placeholder(ndim=1, name=name + 'sample_weights')
- sample_weight_modes.append(None)
+ if (i not in skip_target_weighing_indices and
+ name not in sample_weight_mode):
+ raise ValueError('Output "' + name +
+ '" missing from sample_weight_modes dictionary')
+ weight, mode = training_utils.get_output_sample_weight_and_mode(
+ skip_target_weighing_indices, sample_weight_mode.get(name), name, i)
sample_weights.append(weight)
+ sample_weight_modes.append(mode)
elif isinstance(sample_weight_mode, list):
if len(sample_weight_mode) != len(self.outputs):
raise ValueError('When passing a list as sample_weight_mode, '
@@ -394,36 +384,17 @@ class Model(Network):
'The model has ' + str(len(self.outputs)) +
' outputs, but you passed '
'sample_weight_mode=' + str(sample_weight_mode))
- for i in range(len(self.output_names)):
- if i in skip_target_weighing_indices:
- weight = None
- sample_weight_modes.append(None)
- else:
- mode = sample_weight_mode[i]
- name = self.output_names[i]
- if mode == 'temporal':
- weight = K.placeholder(ndim=2, name=name + '_sample_weights')
- sample_weight_modes.append('temporal')
- else:
- weight = K.placeholder(ndim=1, name=name + '_sample_weights')
- sample_weight_modes.append(None)
+ for i, name in enumerate(self.output_names):
+ weight, mode = training_utils.get_output_sample_weight_and_mode(
+ skip_target_weighing_indices, sample_weight_mode[i], name, i)
sample_weights.append(weight)
+ sample_weight_modes.append(mode)
else:
for i, name in enumerate(self.output_names):
- if i in skip_target_weighing_indices:
- sample_weight_modes.append(None)
- sample_weights.append(None)
- else:
- if sample_weight_mode == 'temporal':
- sample_weights.append(array_ops.placeholder_with_default(
- constant_op.constant([[1.]], dtype=K.floatx()),
- shape=[None, None], name=name + '_sample_weights'))
- sample_weight_modes.append('temporal')
- else:
- sample_weights.append(array_ops.placeholder_with_default(
- constant_op.constant([1.], dtype=K.floatx()),
- shape=[None], name=name + '_sample_weights'))
- sample_weight_modes.append(None)
+ weight, mode = training_utils.get_output_sample_weight_and_mode(
+ skip_target_weighing_indices, sample_weight_mode, name, i)
+ sample_weights.append(weight)
+ sample_weight_modes.append(mode)
self.sample_weight_modes = sample_weight_modes
self._feed_sample_weight_modes = []
for i in range(len(self.outputs)):
@@ -486,43 +457,21 @@ class Model(Network):
weights = sample_weights[i]
output_metrics = nested_metrics[i]
output_weighted_metrics = nested_weighted_metrics[i]
+ output_shape = self.outputs[i].get_shape().as_list()
+ loss_fn = self.loss_functions[i]
- def handle_metrics(metrics, weights=None):
+ def handle_metrics(metrics, output_shape, loss_fn, weights=None):
+ """Invokes metric functions for the output."""
for metric in metrics:
- if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
- # custom handling of accuracy/crossentropy
- # (because of class mode duality)
- output_shape = self.outputs[i].get_shape().as_list()
- if (output_shape[-1] == 1 or
- self.loss_functions[i] == losses.binary_crossentropy):
- # case: binary accuracy/crossentropy
- if metric in ('accuracy', 'acc'):
- metric_fn = metrics_module.binary_accuracy
- elif metric in ('crossentropy', 'ce'):
- metric_fn = metrics_module.binary_crossentropy
- elif self.loss_functions[
- i] == losses.sparse_categorical_crossentropy:
- # case: categorical accuracy/crossentropy with sparse targets
- if metric in ('accuracy', 'acc'):
- metric_fn = metrics_module.sparse_categorical_accuracy
- elif metric in ('crossentropy', 'ce'):
- metric_fn = metrics_module.sparse_categorical_crossentropy
- else:
- # case: categorical accuracy/crossentropy
- if metric in ('accuracy', 'acc'):
- metric_fn = metrics_module.categorical_accuracy
- elif metric in ('crossentropy', 'ce'):
- metric_fn = metrics_module.categorical_crossentropy
- weighted_metric_fn = training_utils.weighted_masked_objective(
- metric_fn)
- else:
- metric_fn = metrics_module.get(metric)
- weighted_metric_fn = training_utils.weighted_masked_objective(
- metric_fn)
- metric_name = training_utils.get_base_metric_name(
+ metric_fn = training_utils.get_metric_function(
+ metric, output_shape=output_shape, loss_fn=loss_fn)
+ metric_name = training_utils.get_metric_name(
metric, weighted=weights is not None)
+
with K.name_scope(metric_name):
+ weighted_metric_fn = training_utils.weighted_masked_objective(
+ metric_fn)
metric_result = weighted_metric_fn(
y_true, y_pred, weights=weights, mask=masks[i])
@@ -536,8 +485,9 @@ class Model(Network):
self.stateful_metric_functions.append(metric_fn)
self.metrics_updates += metric_fn.updates
- handle_metrics(output_metrics)
- handle_metrics(output_weighted_metrics, weights=weights)
+ handle_metrics(output_metrics, output_shape, loss_fn)
+ handle_metrics(
+ output_weighted_metrics, output_shape, loss_fn, weights=weights)
# Prepare gradient updates and state updates.
self.total_loss = total_loss
@@ -607,7 +557,6 @@ class Model(Network):
updates=updates,
name='train_function',
**self._function_kwargs)
- self._post_build_cleanup()
def _make_test_function(self):
if not hasattr(self, 'test_function'):
@@ -625,7 +574,6 @@ class Model(Network):
updates=self.state_updates + self.metrics_updates,
name='test_function',
**self._function_kwargs)
- self._post_build_cleanup()
def _make_predict_function(self):
if not hasattr(self, 'predict_function'):
@@ -644,7 +592,6 @@ class Model(Network):
updates=self.state_updates,
name='predict_function',
**kwargs)
- self._post_build_cleanup()
def _get_iterator_get_next_tensors(self, iterator):
get_next_op = self._iterator_get_next.get(iterator, None)
@@ -991,10 +938,14 @@ class Model(Network):
inputs = inputs[0]
if tensor_util.is_tensor(inputs):
- input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
+ if context.executing_eagerly():
+ input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
+ self.build(input_shape=input_shape)
+ else:
+ self.symbolic_set_inputs(inputs)
else:
input_shape = (None,) + inputs.shape[1:]
- self.build(input_shape=input_shape)
+ self.build(input_shape=input_shape)
elif context.executing_eagerly():
self._eager_set_inputs(inputs)
else:
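
The three branches above now share
`training_utils.get_output_sample_weight_and_mode`. Its definition is not part
of this excerpt; based on the code it replaces, it presumably looks roughly
like the following reconstruction (an assumption, not the actual source):

    from tensorflow.python.framework import constant_op
    from tensorflow.python.keras import backend as K
    from tensorflow.python.ops import array_ops

    def get_output_sample_weight_and_mode(skip_target_weighing_indices,
                                          sample_weight_mode, output_name,
                                          output_index):
      """Returns the (sample_weight, mode) pair for one model output."""
      if output_index in skip_target_weighing_indices:
        return None, None
      if sample_weight_mode == 'temporal':
        default_value, shape, mode = [[1.]], [None, None], 'temporal'
      else:
        default_value, shape, mode = [1.], [None], None
      # A placeholder with an all-ones default, so feeding sample weights
      # stays optional at fit() time.
      weight = array_ops.placeholder_with_default(
          constant_op.constant(default_value, dtype=K.floatx()),
          shape=shape,
          name=output_name + '_sample_weights')
      return weight, mode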
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 397de42985..b1149ab5b5 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -30,35 +30,11 @@ from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_util
from tensorflow.python.keras import backend
from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras import losses
-from tensorflow.python.keras import metrics as metrics_module
from tensorflow.python.keras.engine import training_utils
from tensorflow.python.keras.utils import generic_utils
from tensorflow.python.platform import tf_logging as logging
-def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None):
- if metric == 'accuracy' or metric == 'acc':
- # custom handling of accuracy
- # (because of class mode duality)
- output_shape = internal_output_shapes
- if output_shape[-1] == 1 or loss_func == losses.binary_crossentropy:
- # case: binary accuracy
- acc_fn = metrics_module.binary_accuracy
- elif loss_func == losses.sparse_categorical_crossentropy:
- # case: categorical accuracy with sparse targets
- acc_fn = metrics_module.sparse_categorical_accuracy
- else:
- acc_fn = metrics_module.categorical_accuracy
-
- metric_name = 'acc'
- return metric_name, acc_fn
- else:
- metric_fn = metrics_module.get(metric)
- metric_name = metric_fn.__name__
- return metric_name, metric_fn
-
-
def _eager_loss_fn(outputs, targets, loss_fn, output_name):
with backend.name_scope(output_name + '_loss'):
loss = loss_fn(targets, outputs)
@@ -74,9 +50,8 @@ def _eager_metrics_fn(model, outputs, targets):
targets: The predictions or targets of the given model.
Returns:
- Returns the metric names and metric results for each output of the model.
+ Returns the metric results for each output of the model.
"""
- metric_names = []
metric_results = []
if not isinstance(outputs, list):
outputs = [outputs]
@@ -87,18 +62,15 @@ def _eager_metrics_fn(model, outputs, targets):
for i in range(len(model.outputs)):
output_metrics = model.nested_metrics[i]
for nested_output_metric in output_metrics:
- metric_name, metric_fn = _get_metrics_info(
+ metric_fn = training_utils.get_metric_function(
nested_output_metric, backend.int_shape(model.outputs[i]),
model.loss_functions[i])
-
- if len(model.output_names) > 1:
- metric_name = model.output_names[i] + '_' + metric_name
- if metric_name not in model.metrics_names:
- model.metrics_names.append(metric_name)
+ # weighted metrics are not supported in eager mode
+ metric_name = training_utils.get_metric_name(
+ nested_output_metric, weighted=False)
with backend.name_scope(metric_name):
metric_result = metric_fn(targets[i], outputs[i])
- metric_names.append(metric_name)
metric_results.append(backend.mean(metric_result))
return metric_results
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index c621a88fb3..129441d159 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -731,6 +731,54 @@ class LossWeightingTest(test.TestCase):
model.fit(x_np, [y_np, y_np], epochs=1,
sample_weight={'1': bad_w_np})
+ def test_default_sample_weight(self):
+ """Verifies that fit works without having to set sample_weight."""
+
+ num_classes = 5
+ input_dim = 5
+ timesteps = 3
+ with self.test_session():
+ model = keras.models.Sequential()
+ model.add(
+ keras.layers.TimeDistributed(
+ keras.layers.Dense(num_classes),
+ input_shape=(timesteps, input_dim)))
+
+ x = np.random.random((10, timesteps, input_dim))
+ y = np.random.random((10, timesteps, num_classes))
+
+ # sample_weight_mode is a list and mode value is None
+ model.compile(loss='mse', optimizer='rmsprop', sample_weight_mode=[None])
+ model.fit(x, y, epochs=1, batch_size=10)
+
+ # sample_weight_mode is a list and mode value is `temporal`
+ model.compile(
+ loss='mse', optimizer='rmsprop', sample_weight_mode=['temporal'])
+ model.fit(x, y, epochs=1, batch_size=10)
+
+ # sample_weight_mode is a dict and mode value is None
+ model.compile(
+ loss='mse',
+ optimizer='rmsprop',
+ sample_weight_mode={'time_distributed': None})
+ model.fit(x, y, epochs=1, batch_size=10)
+
+ # sample_weight_mode is a dict and mode value is `temporal`
+ model.compile(
+ loss='mse',
+ optimizer='rmsprop',
+ sample_weight_mode={'time_distributed': 'temporal'})
+ model.fit(x, y, epochs=1, batch_size=10)
+
+ # sample_weight_mode is not a list/dict and mode value is None
+ model.compile(loss='mse', optimizer='rmsprop', sample_weight_mode=None)
+ model.fit(x, y, epochs=1, batch_size=10)
+
+ # sample_weight_mode is not a list/dict and mode value is `temporal`
+ model.compile(
+ loss='mse', optimizer='rmsprop', sample_weight_mode='temporal')
+ model.fit(x, y, epochs=1, batch_size=10)
+
class LossMaskingTest(test.TestCase):
@@ -767,6 +815,22 @@ class LossMaskingTest(test.TestCase):
keras.backend.variable(weights), keras.backend.variable(mask)))
+class LearningPhaseTest(test.TestCase):
+
+ def test_empty_model_no_learning_phase(self):
+ with self.test_session():
+ model = keras.models.Sequential()
+ self.assertFalse(model.uses_learning_phase)
+
+ def test_dropout_has_learning_phase(self):
+ with self.test_session():
+ model = keras.models.Sequential()
+ model.add(keras.layers.Dense(2, input_dim=3))
+ model.add(keras.layers.Dropout(0.5))
+ model.add(keras.layers.Dense(2))
+ self.assertTrue(model.uses_learning_phase)
+
+
class TestDynamicTrainability(test.TestCase):
def test_trainable_warning(self):
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index dbbc87daf9..f2cd9c89da 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -26,10 +26,12 @@ import numpy as np
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.data.ops import iterator_ops
from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
from tensorflow.python.framework import tensor_util
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import losses
from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
@@ -700,17 +702,16 @@ def populate_metric_names(model):
for i in range(len(model.outputs)):
metrics = model.nested_metrics[i]
for metric in metrics:
- base_metric_name = get_base_metric_name(metric)
+ base_metric_name = get_metric_name(metric)
add_metric_name(model, base_metric_name, i)
-def get_base_metric_name(metric, weighted=False):
- """Returns the metric name given the metric function.
+def get_metric_name(metric, weighted=False):
+ """Returns the metric name corresponding to the given metric input.
Arguments:
metric: Metric function name or reference.
- weighted: Boolean indicating if the metric for which we are adding
- names is weighted.
+ weighted: Boolean indicating if the given metric is weighted.
Returns:
a metric name.
@@ -734,6 +735,36 @@ def get_base_metric_name(metric, weighted=False):
return metric_name
+def get_metric_function(metric, output_shape=None, loss_fn=None):
+ """Returns the metric function corresponding to the given metric input.
+
+ Arguments:
+ metric: Metric function name or reference.
+ output_shape: The shape of the output that this metric
+ will be calculated for.
+ loss_fn: The loss function used.
+
+ Returns:
+ The metric function.
+ """
+ if metric in ['accuracy', 'acc']:
+ if output_shape[-1] == 1 or loss_fn == losses.binary_crossentropy:
+ return metrics_module.binary_accuracy # case: binary accuracy
+ elif loss_fn == losses.sparse_categorical_crossentropy:
+ # case: categorical accuracy with sparse targets
+ return metrics_module.sparse_categorical_accuracy
+ return metrics_module.categorical_accuracy # case: categorical accuracy
+ elif metric in ['crossentropy', 'ce']:
+ if output_shape[-1] == 1 or loss_fn == losses.binary_crossentropy:
+ return metrics_module.binary_crossentropy # case: binary cross-entropy
+ elif loss_fn == losses.sparse_categorical_crossentropy:
+ # case: categorical cross-entropy with sparse targets
+ return metrics_module.sparse_categorical_crossentropy
+ # case: categorical cross-entropy
+ return metrics_module.categorical_crossentropy
+ return metrics_module.get(metric)
+
+
def add_metric_name(model, metric_name, index):
"""Makes the metric name unique and adds it to the model's metric name list.
@@ -856,3 +887,25 @@ def cast_if_floating_dtype(x):
for val in x
]
return math_ops.cast(x, dtype=K.floatx()) if x.dtype.is_floating else x
+
+
+def get_output_sample_weight_and_mode(skip_target_weighing_indices,
+ sample_weight_mode, output_name,
+ output_index):
+ """Returns the sample weight and weight mode for a single output."""
+ if output_index in skip_target_weighing_indices:
+ return None, None
+
+ if sample_weight_mode == 'temporal':
+ default_value = [[1.]]
+ shape = [None, None]
+ mode = 'temporal'
+ else:
+ default_value = [1.]
+ shape = [None]
+ mode = None
+ weight = array_ops.placeholder_with_default(
+ constant_op.constant(default_value, dtype=K.floatx()),
+ shape=shape,
+ name=output_name + '_sample_weights')
+ return weight, mode
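
Note: get_output_sample_weight_and_mode leans on array_ops.placeholder_with_default so that fit() runs even when no sample_weight is passed: the placeholder yields a broadcastable default of ones unless the caller feeds a weight tensor. A standalone sketch of that behavior (values and names assumed for illustration):

    from tensorflow.python.client import session
    from tensorflow.python.framework import constant_op
    from tensorflow.python.ops import array_ops

    weight = array_ops.placeholder_with_default(
        constant_op.constant([1.], dtype='float32'),
        shape=[None], name='output_1_sample_weights')
    with session.Session() as sess:
      print(sess.run(weight))                      # [1.]  (the default)
      print(sess.run(weight, {weight: [2., 3.]}))  # [2. 3.]  (fed weights)
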
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index eba10da6f3..61ab69c16f 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -284,6 +284,13 @@ class Softmax(Layer):
class ReLU(Layer):
"""Rectified Linear Unit activation function.
+ With default values, it returns element-wise `max(x, 0)`.
+
+ Otherwise, it follows:
+ `f(x) = max_value` for `x >= max_value`,
+ `f(x) = x` for `threshold <= x < max_value`,
+ `f(x) = negative_slope * (x - threshold)` otherwise.
+
Input shape:
Arbitrary. Use the keyword argument `input_shape`
(tuple of integers, does not include the samples axis)
@@ -294,21 +301,39 @@ class ReLU(Layer):
Arguments:
max_value: float >= 0. Maximum activation value.
+ negative_slope: float >= 0. Negative slope coefficient.
+ threshold: float. Threshold value for thresholded activation.
"""
- def __init__(self, max_value=None, **kwargs):
+ def __init__(self, max_value=None, negative_slope=0, threshold=0, **kwargs):
super(ReLU, self).__init__(**kwargs)
- self.support_masking = True
- self.max_value = K.cast_to_floatx(max_value)
- if self.max_value < 0.:
+ if max_value is not None and max_value < 0.:
raise ValueError('max_value of Relu layer '
'cannot be negative value: ' + str(max_value))
+ if negative_slope < 0.:
+ raise ValueError('negative_slope of Relu layer '
+ 'cannot be negative value: ' + str(negative_slope))
+
+ self.support_masking = True
+ self.max_value = K.cast_to_floatx(max_value)
+ self.negative_slope = K.cast_to_floatx(negative_slope)
+ self.threshold = K.cast_to_floatx(threshold)
def call(self, inputs):
- return activations.relu(inputs, max_value=self.max_value)
+ # alpha is used for leaky relu slope in activations instead of
+ # negative_slope.
+ return activations.relu(
+ inputs,
+ alpha=self.negative_slope,
+ max_value=self.max_value,
+ threshold=self.threshold)
def get_config(self):
- config = {'max_value': self.max_value}
+ config = {
+ 'max_value': self.max_value,
+ 'negative_slope': self.negative_slope,
+ 'threshold': self.threshold
+ }
base_config = super(ReLU, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
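
Note: to make the new piecewise docstring concrete, a hand-worked check (parameter values are assumptions for illustration, and this presumes the backend relu honors the alpha/max_value/threshold wiring in call() above): with max_value=6, negative_slope=0.1, threshold=1, the input [-1, 0.5, 2, 8] maps to [0.1*(-1-1), 0.1*(0.5-1), 2, 6] = [-0.2, -0.05, 2, 6].

    import numpy as np
    from tensorflow.python import keras

    layer = keras.layers.ReLU(max_value=6., negative_slope=0.1, threshold=1.)
    x = keras.backend.constant(np.array([[-1., 0.5, 2., 8.]], dtype='float32'))
    print(keras.backend.eval(layer(x)))  # ~[[-0.2  -0.05  2.  6.]]
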
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index 9e1f15b1bc..53c1baa2bb 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -75,6 +75,14 @@ class AdvancedActivationsTest(test.TestCase):
testing_utils.layer_test(keras.layers.ReLU,
kwargs={'max_value': -10},
input_shape=(2, 3, 4))
+ with self.assertRaisesRegexp(
+ ValueError,
+ 'negative_slope of Relu layer cannot be negative value: -2'):
+ with self.test_session():
+ testing_utils.layer_test(
+ keras.layers.ReLU,
+ kwargs={'negative_slope': -2},
+ input_shape=(2, 3, 4))
if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
index 8fd970239f..2ed0aa8f26 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
@@ -220,7 +220,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
self.assertNotEqual(out4.max(), out5.max())
@parameterized.named_parameters(
- *testing_utils.generate_combinations_with_testcase_name(
+ *test_util.generate_combinations_with_testcase_name(
rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False],
bidirectional=[True, False], implementation=[1, 2],
model_nest_level=[1, 2], model_type=['seq', 'func']))
@@ -301,7 +301,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
os.remove(fname)
@parameterized.named_parameters(
- *testing_utils.generate_combinations_with_testcase_name(
+ *test_util.generate_combinations_with_testcase_name(
rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False]))
def test_load_weights_between_noncudnn_rnn_time_distributed(self, rnn_type,
to_cudnn):
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 72e15763cb..7d8b1fec45 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -21,6 +21,8 @@ from __future__ import print_function
from abc import ABCMeta
from abc import abstractmethod
+
+import types
import six
from tensorflow.python.eager import context
@@ -58,7 +60,14 @@ from tensorflow.python.util import tf_decorator
from tensorflow.python.util.tf_export import tf_export
-def update_state(update_state_fn):
+def check_is_tensor_or_operation(x, name):
+ """Raises type error if the given input is not a tensor or operation."""
+ if not (isinstance(x, ops.Tensor) or isinstance(x, ops.Operation)):
+ raise TypeError('{0} must be a Tensor or Operation, given: {1}'.format(
+ name, x))
+
+
+def update_state_wrapper(update_state_fn):
"""Decorator to wrap metric `update_state()` with `defun()`, `add_update()`.
Args:
@@ -70,7 +79,7 @@ def update_state(update_state_fn):
executed to update the metric state with the given inputs.
"""
- def decorated(*args, **kwargs):
+ def decorated(metric_obj, *args, **kwargs):
"""Decorated function with `defun()` and `add_update()`."""
# Converting update_state_fn() into a graph function, so that
@@ -79,14 +88,15 @@ def update_state(update_state_fn):
defuned_update_state_fn = function.defun(update_state_fn)
update_op = defuned_update_state_fn(*args, **kwargs)
if update_op is not None: # update_op will be None in eager execution.
- metric_obj = args[0]
metric_obj.add_update(update_op, inputs=True)
+ check_is_tensor_or_operation(
+ update_op, 'Metric {0}\'s update'.format(metric_obj.name))
return update_op
return tf_decorator.make_decorator(update_state_fn, decorated)
-def result(result_fn):
+def result_wrapper(result_fn):
"""Decorator to wrap metric `result()` function in `merge_call()`.
Result computation is an idempotent operation that simply calculates the
@@ -104,26 +114,29 @@ def result(result_fn):
The metric result tensor.
"""
- def decorated(*args):
+ def decorated(metric_obj, *args):
"""Decorated function with merge_call."""
tower_context = distribute_lib.get_tower_context()
if tower_context is None: # if in cross tower context already
- return result_fn()
-
- # TODO(psv): Test distribution of metrics using different distribution
- # strategies.
-
- # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
- # with distribution object as the first parameter. We create a wrapper here
- # so that the result function need not have that parameter.
- def merge_fn_wrapper(distribution, merge_fn, *args):
- # We will get `PerDevice` merge function. Taking the first one as all are
- # identical copies of the function that we had passed below.
- return distribution.unwrap(merge_fn)[0](*args)
-
- # Wrapping result in merge_call. merge_call is used when we want to leave
- # tower mode and compute a value in cross tower mode.
- return tower_context.merge_call(merge_fn_wrapper, result_fn, *args)
+ result_t = result_fn(*args)
+ else:
+ # TODO(psv): Test distribution of metrics using different distribution
+ # strategies.
+
+ # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
+ # with distribution object as the first parameter. We create a wrapper
+ # here so that the result function need not have that parameter.
+ def merge_fn_wrapper(distribution, merge_fn, *args):
+ # We will get `PerDevice` merge function. Taking the first one as all
+ # are identical copies of the function that we had passed below.
+ return distribution.unwrap(merge_fn)[0](*args)
+
+ # Wrapping result in merge_call. merge_call is used when we want to leave
+ # tower mode and compute a value in cross tower mode.
+ result_t = tower_context.merge_call(merge_fn_wrapper, result_fn, *args)
+ check_is_tensor_or_operation(result_t,
+ 'Metric {0}\'s result'.format(metric_obj.name))
+ return result_t
return tf_decorator.make_decorator(result_fn, decorated)
@@ -246,24 +259,19 @@ class Metric(Layer):
* `__init__()`: All state variables should be created in this method by
calling `self.add_weight()` like: `self.var = self.add_weight(...)`
* `update_state()`: Has all updates to the state variables like:
- self.var.assign_add(...). Please decorate the function with:
- @update_state: Converts `update_state()` into a graph function, so that
- we can return a single op that performs all of the variable updates and
- adds the update op to the metric layer.
+ self.var.assign_add(...).
* `result()`: Computes and returns a value for the metric
- from the state variables. Please decorate the function with:
- @result: Wraps `result()` in a distribution strategy merge_call().
+ from the state variables.
Example subclass implementation:
```
class BinaryTruePositives(Metric):
- def __init__(self, name='binary-true-positives', dtype=dtypes.float64):
+ def __init__(self, name='binary-true-positives', dtype=None):
super(BinaryTruePositives, self).__init__(name=name, dtype=dtype)
self.true_positives = self.add_weight(
'true_positives', initializer=init_ops.zeros_initializer)
- @update_state
def update_state(self, y_true, y_pred, sample_weight=None):
y_true = math_ops.cast(y_true, dtypes.bool)
y_pred = math_ops.cast(y_pred, dtypes.bool)
@@ -278,17 +286,24 @@ class Metric(Layer):
values = math_ops.multiply(values, sample_weight)
state_ops.assign_add(self.true_positives, math_ops.reduce_sum(values))
- @result
def result(self):
return array_ops.identity(self.true_positives)
```
"""
__metaclass__ = ABCMeta
- def __init__(self, name=None, dtype=dtypes.float64):
+ def __init__(self, name=None, dtype=None):
super(Metric, self).__init__(name=name, dtype=dtype)
self.stateful = True # All metric layers are stateful.
self.built = True
+ self._dtype = K.floatx() if dtype is None else dtypes.as_dtype(dtype).name
+
+ def __new__(cls, *args, **kwargs):
+ obj = super(Metric, cls).__new__(cls, *args, **kwargs)
+ obj.update_state = types.MethodType(
+ update_state_wrapper(obj.update_state), obj)
+ obj.result = types.MethodType(result_wrapper(obj.result), obj)
+ return obj
def __call__(self, *args, **kwargs):
"""Accumulates statistics and then computes metric result value.
@@ -301,9 +316,9 @@ class Metric(Layer):
Returns:
The metric value tensor.
"""
- update_op = self.update_state(*args, **kwargs)
+ update_op = self.update_state(*args, **kwargs) # pylint: disable=not-callable
with ops.control_dependencies([update_op]):
- return self.result()
+ return self.result() # pylint: disable=not-callable
def reset_states(self):
"""Resets all of the metric state variables.
@@ -318,10 +333,8 @@ class Metric(Layer):
def update_state(self, *args, **kwargs):
"""Accumulates statistics for the metric.
- Please decorate the function with:
- @update_state: Converts `update_state()` into a graph function, so that
- we can return a single op that performs all of the variable updates
- This means:
+ Note: This function is executed as a graph function in graph mode.
+ This means:
a) Operations on the same resource are executed in textual order.
This should make it easier to do things like add the updated
value of a variable to another, for example.
@@ -343,9 +356,6 @@ class Metric(Layer):
Result computation is an idempotent operation that simply calculates the
metric value using the state variables.
-
- Please decorate the function with:
- @result: Wraps `result()` in a distribution strategy merge_call().
"""
raise NotImplementedError('Must be implemented in subclasses.')
@@ -380,7 +390,13 @@ class Mean(Metric):
Use `sample_weight` of 0 to mask values.
"""
- def __init__(self, name='mean', dtype=dtypes.float64):
+ def __init__(self, name='mean', dtype=None):
+ """Creates a `Mean` instance.
+
+ Args:
+ name: (Optional) string name of the metric instance.
+ dtype: (Optional) data type of the metric result.
+ """
super(Mean, self).__init__(name=name, dtype=dtype)
# Create new state variables
self.total = self.add_weight(
@@ -388,7 +404,6 @@ class Mean(Metric):
self.count = self.add_weight(
'count', initializer=init_ops.zeros_initializer)
- @update_state
def update_state(self, values, sample_weight=None):
"""Accumulates statistics for computing the mean.
@@ -418,14 +433,84 @@ class Mean(Metric):
state_ops.assign_add(self.total, values)
state_ops.assign_add(self.count, num_values)
- @result
def result(self):
return _safe_div(self.total, self.count)
+class MeanMetricWrapper(Mean):
+ """Wraps a stateless metric function with the Mean metric."""
+
+ def __init__(self, fn, name=None, dtype=None, **kwargs):
+ """Creates a `MeanMetricWrapper` instance.
+
+ Args:
+ fn: The metric function to wrap, with signature
+ `fn(y_true, y_pred, **kwargs)`.
+ name: (Optional) string name of the metric instance.
+ dtype: (Optional) data type of the metric result.
+ **kwargs: The keyword arguments that are passed on to `fn`.
+ """
+ super(MeanMetricWrapper, self).__init__(name=name, dtype=dtype)
+ self._fn = fn
+ self._fn_kwargs = kwargs
+
+ def update_state(self, y_true, y_pred, sample_weight=None):
+ """Accumulates metric statistics.
+
+ `y_true` and `y_pred` should have the same shape.
+
+ Args:
+ y_true: The ground truth values.
+ y_pred: The predicted values.
+ sample_weight: Optional weighting of each example. Defaults to 1. Can be
+ a `Tensor` whose rank is either 0, or the same rank as `y_true`,
+ and must be broadcastable to `y_true`.
+ """
+ y_true = math_ops.cast(y_true, self._dtype)
+ y_pred = math_ops.cast(y_pred, self._dtype)
+ y_pred, y_true, sample_weight = _squeeze_or_expand_dimensions(
+ y_pred, y_true, sample_weight)
+
+ matches = self._fn(y_true, y_pred, **self._fn_kwargs)
+ super(MeanMetricWrapper, self).update_state(
+ matches, sample_weight=sample_weight)
+
+ def get_config(self):
+ config = self._fn_kwargs
+ base_config = super(MeanMetricWrapper, self).get_config()
+ return dict(list(base_config.items()) + list(config.items()))
+
+
+class BinaryAccuracy(MeanMetricWrapper):
+ """Calculates how often predictions matches labels.
+
+ This metric creates two local variables, `total` and `count` that are used to
+ compute the frequency with which `y_pred` matches `y_true`. This frequency is
+ ultimately returned as `binary accuracy`: an idempotent operation that simply
+ divides `total` by `count`.
+
+ If `sample_weight` is `None`, weights default to 1.
+ Use `sample_weight` of 0 to mask values.
+ """
+
+ def __init__(self, name='binary-accuracy', dtype=None, threshold=0.5):
+ """Creates a `BinaryAccuracy` instance.
+
+ Args:
+ name: (Optional) string name of the metric instance.
+ dtype: (Optional) data type of the metric result.
+ threshold: (Optional) Float representing the threshold for deciding
+ whether prediction values are 1 or 0.
+ """
+ super(BinaryAccuracy, self).__init__(
+ binary_accuracy, name, dtype=dtype, threshold=threshold)
+
+
@tf_export('keras.metrics.binary_accuracy')
-def binary_accuracy(y_true, y_pred):
- return K.mean(math_ops.equal(y_true, math_ops.round(y_pred)), axis=-1)
+def binary_accuracy(y_true, y_pred, threshold=0.5):
+ threshold = math_ops.cast(threshold, y_pred.dtype)
+ y_pred = math_ops.cast(y_pred > threshold, y_pred.dtype)
+ return K.mean(math_ops.equal(y_true, y_pred), axis=-1)
@tf_export('keras.metrics.categorical_accuracy')
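
Note: the net effect of the __new__ override above is that subclasses no longer decorate anything; update_state() and result() are wrapped per instance automatically. A minimal sketch of a subclass under the new contract (SumTotal is a toy metric invented for illustration):

    from tensorflow.python.keras import metrics
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import init_ops
    from tensorflow.python.ops import math_ops
    from tensorflow.python.ops import state_ops

    class SumTotal(metrics.Metric):

      def __init__(self, name='sum-total', dtype=None):
        super(SumTotal, self).__init__(name=name, dtype=dtype)
        self.total = self.add_weight(
            'total', initializer=init_ops.zeros_initializer)

      def update_state(self, values):
        # No @update_state_wrapper needed: __new__ applies it automatically.
        state_ops.assign_add(
            self.total, math_ops.reduce_sum(math_ops.cast(values, self._dtype)))

      def result(self):
        # Wrapped by result_wrapper; must return a Tensor or Operation.
        return array_ops.identity(self.total)
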
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 6d8269f34d..d583379708 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -196,7 +196,7 @@ class KerasMetricsTest(test.TestCase):
# check config
self.assertEqual(m.name, 'my_mean')
self.assertTrue(m.stateful)
- self.assertEqual(m.dtype, dtypes.float64)
+ self.assertEqual(m.dtype, dtypes.float32)
self.assertEqual(len(m.variables), 2)
self.evaluate(variables.global_variables_initializer())
@@ -212,7 +212,7 @@ class KerasMetricsTest(test.TestCase):
# check update_state() and result() + state accumulation + tensor input
update_op = m.update_state(ops.convert_n_to_tensor([1, 5]))
self.evaluate(update_op)
- self.assertEqual(self.evaluate(m.result()), 106 / 3)
+ self.assertAlmostEqual(self.evaluate(m.result()), 106 / 3, 2)
self.assertEqual(self.evaluate(m.total), 106) # 100 + 1 + 5
self.assertEqual(self.evaluate(m.count), 3)
@@ -223,7 +223,8 @@ class KerasMetricsTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def test_mean_with_sample_weight(self):
- m = metrics.Mean()
+ m = metrics.Mean(dtype=dtypes.float64)
+ self.assertEqual(m.dtype, dtypes.float64)
self.evaluate(variables.global_variables_initializer())
# check scalar weight
@@ -308,6 +309,94 @@ class KerasMetricsTest(test.TestCase):
self.assertEqual(200., self.evaluate(restore_mean.result()))
self.assertEqual(3, self.evaluate(restore_mean.count))
+ @test_util.run_in_graph_and_eager_modes
+ def test_binary_accuracy(self):
+ acc_obj = metrics.BinaryAccuracy(name='my acc')
+
+ # check config
+ self.assertEqual(acc_obj.name, 'my acc')
+ self.assertTrue(acc_obj.stateful)
+ self.assertEqual(len(acc_obj.variables), 2)
+ self.assertEqual(acc_obj.dtype, dtypes.float32)
+ self.evaluate(variables.global_variables_initializer())
+
+ # verify that correct value is returned
+ update_op = acc_obj.update_state([[1], [0]], [[1], [0]])
+ self.evaluate(update_op)
+ result = self.evaluate(acc_obj.result())
+ self.assertEqual(result, 1) # 2/2
+
+ # check y_pred squeeze
+ update_op = acc_obj.update_state([[1], [1]], [[[1]], [[0]]])
+ self.evaluate(update_op)
+ result = self.evaluate(acc_obj.result())
+ self.assertAlmostEqual(result, 0.75, 2) # 3/4
+
+ # check y_true squeeze
+ result_t = acc_obj([[[1]], [[1]]], [[1], [0]])
+ result = self.evaluate(result_t)
+ self.assertAlmostEqual(result, 0.67, 2) # 4/6
+
+ # check with sample_weight
+ result_t = acc_obj([[1], [1]], [[1], [0]], [[0.5], [0.2]])
+ result = self.evaluate(result_t)
+ self.assertAlmostEqual(result, 0.67, 2) # 4.5/6.7
+
+ # check incompatible shapes
+ with self.assertRaisesRegexp(ValueError,
+ r'Shapes \(1,\) and \(2,\) are incompatible'):
+ acc_obj.update_state([1, 1], [1])
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_binary_accuracy_threshold(self):
+ acc_obj = metrics.BinaryAccuracy(threshold=0.7)
+ self.evaluate(variables.global_variables_initializer())
+ result_t = acc_obj([[1], [1], [0], [0]], [[0.9], [0.6], [0.4], [0.8]])
+ result = self.evaluate(result_t)
+ self.assertAlmostEqual(result, 0.5, 2)
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_invalid_result(self):
+
+ class InvalidResult(metrics.Metric):
+
+ def __init__(self, name='invalid-result', dtype=dtypes.float64):
+ super(InvalidResult, self).__init__(name=name, dtype=dtype)
+
+ def update_state(self, *args, **kwargs):
+ pass
+
+ def result(self):
+ return 1
+
+ invalid_result_obj = InvalidResult()
+ with self.assertRaisesRegexp(
+ TypeError,
+ 'Metric invalid-result\'s result must be a Tensor or Operation, given:'
+ ):
+ invalid_result_obj.result()
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_invalid_update(self):
+
+ class InvalidUpdate(metrics.Metric):
+
+ def __init__(self, name='invalid-update', dtype=dtypes.float64):
+ super(InvalidUpdate, self).__init__(name=name, dtype=dtype)
+
+ def update_state(self, *args, **kwargs):
+ return [1]
+
+ def result(self):
+ pass
+
+ invalid_update_obj = InvalidUpdate()
+ with self.assertRaisesRegexp(
+ TypeError,
+ 'Metric invalid-update\'s update must be a Tensor or Operation, given:'
+ ):
+ invalid_update_obj.update_state()
+
if __name__ == '__main__':
test.main()
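
Note: for readers tracing the magic numbers in test_binary_accuracy above: before the weighted call the accumulated state is total=4, count=6 (four matches over six samples); the weighted update adds 1*0.5 + 0*0.2 = 0.5 to total and 0.5 + 0.2 = 0.7 to count, hence the '# 4.5/6.7' comment. A plain-Python check:

    total, count = 4.0, 6.0                  # state before the weighted update
    matches, weights = [1.0, 0.0], [0.5, 0.2]
    total += sum(m * w for m, w in zip(matches, weights))  # +0.5
    count += sum(weights)                                  # +0.7
    print(round(total / count, 2))           # 0.67
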
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 3ac4852eff..3a153573f8 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -29,6 +29,8 @@ from tensorflow.python.eager import context
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import test_util
from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import init_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.platform import test
from tensorflow.python.training.checkpointable import data_structures
@@ -65,6 +67,22 @@ class SimpleTestModel(keras.Model):
return self.dense2(x)
+class SimpleConvTestModel(keras.Model):
+
+ def __init__(self, num_classes=10):
+ super(SimpleConvTestModel, self).__init__(name='test_model')
+ self.num_classes = num_classes
+
+ self.conv1 = keras.layers.Conv2D(32, (3, 3), activation='relu')
+ self.flatten = keras.layers.Flatten()
+ self.dense1 = keras.layers.Dense(num_classes, activation='softmax')
+
+ def call(self, x):
+ x = self.conv1(x)
+ x = self.flatten(x)
+ return self.dense1(x)
+
+
class MultiIOTestModel(keras.Model):
def __init__(self, use_bn=False, use_dp=False, num_classes=(2, 3)):
@@ -162,9 +180,6 @@ def get_nested_model_3(input_dim, num_classes):
x = self.dense2(x)
return self.bn(x)
- def compute_output_shape(self, input_shape):
- return tensor_shape.TensorShape((input_shape[0], 5))
-
test_model = Inner()
x = test_model(x)
outputs = keras.layers.Dense(num_classes)(x)
@@ -174,6 +189,213 @@ def get_nested_model_3(input_dim, num_classes):
class ModelSubclassingTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
+ def test_invalid_input_shape_build(self):
+ num_classes = 2
+ input_dim = 50
+
+ model = SimpleTestModel(num_classes=num_classes,
+ use_dp=True,
+ use_bn=True)
+
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ with self.assertRaisesRegexp(
+ ValueError, 'input shape is not one of the valid types'):
+ model.build(input_shape=tensor_shape.Dimension(input_dim))
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_embed_dtype_with_subclass_build(self):
+ class Embedding(keras.layers.Layer):
+ """An Embedding layer."""
+
+ def __init__(self, vocab_size, embedding_dim, **kwargs):
+ super(Embedding, self).__init__(**kwargs)
+ self.vocab_size = vocab_size
+ self.embedding_dim = embedding_dim
+
+ def build(self, _):
+ self.embedding = self.add_variable(
+ 'embedding_kernel',
+ shape=[self.vocab_size, self.embedding_dim],
+ dtype=np.float32,
+ initializer=init_ops.random_uniform_initializer(-0.1, 0.1),
+ trainable=True)
+
+ def call(self, x):
+ return embedding_ops.embedding_lookup(self.embedding, x)
+
+ class EmbedModel(keras.Model):
+
+ def __init__(self, vocab_size, embed_size):
+ super(EmbedModel, self).__init__()
+ self.embed1 = Embedding(vocab_size, embed_size)
+
+ def call(self, inputs):
+ return self.embed1(inputs)
+
+ model = EmbedModel(100, 20)
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ with self.assertRaisesRegexp(
+ ValueError, 'if your layers do not support float type inputs'):
+ model.build(input_shape=(35, 20))
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_single_time_step_rnn_build(self):
+ dim = 4
+ timesteps = 1
+ batch_input_shape = (None, timesteps, dim)
+ units = 3
+
+ class SimpleRNNModel(keras.Model):
+
+ def __init__(self):
+ super(SimpleRNNModel, self).__init__()
+ self.lstm = keras.layers.LSTM(units)
+
+ def call(self, inputs):
+ return self.lstm(inputs)
+
+ model = SimpleRNNModel()
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ model.build(batch_input_shape)
+ self.assertTrue(model.weights, ('Model should have weights now that it '
+ 'has been properly built.'))
+ self.assertTrue(model.built, 'Model should be built after calling `build`.')
+ model(array_ops.ones((32, timesteps, dim)))
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_single_io_subclass_build(self):
+ num_classes = 2
+ input_dim = 50
+ batch_size = None
+
+ model = SimpleTestModel(num_classes=num_classes,
+ use_dp=True,
+ use_bn=True)
+
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ model.build(input_shape=(batch_size, input_dim))
+ self.assertTrue(model.weights, ('Model should have weights now that it '
+ 'has been properly built.'))
+ self.assertTrue(model.built, 'Model should be built after calling `build`.')
+ model(array_ops.ones((32, input_dim)))
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_single_io_dimension_subclass_build(self):
+ num_classes = 2
+ input_dim = tensor_shape.Dimension(50)
+ batch_size = tensor_shape.Dimension(None)
+
+ model = SimpleTestModel(num_classes=num_classes,
+ use_dp=True,
+ use_bn=True)
+
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ model.build(input_shape=(batch_size, input_dim))
+ self.assertTrue(model.weights, ('Model should have weights now that it '
+ 'has been properly built.'))
+ self.assertTrue(model.built, 'Model should be built after calling `build`.')
+ model(array_ops.ones((32, input_dim)))
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_multidim_io_subclass_build(self):
+ num_classes = 10
+ # Input size, e.g. image
+ batch_size = 32
+ input_shape = (32, 32, 3)
+
+ model = SimpleConvTestModel(num_classes)
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ batch_input_shape = (batch_size,) + input_shape
+ model.build(input_shape=batch_input_shape)
+ self.assertTrue(model.weights, ('Model should have weights now that it '
+ 'has been properly built.'))
+ self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
+ model(array_ops.ones(batch_input_shape))
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_tensorshape_io_subclass_build(self):
+ num_classes = 10
+ # Input size, e.g. image
+ batch_size = None
+ input_shape = (32, 32, 3)
+
+ model = SimpleConvTestModel(num_classes)
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ model.build(
+ input_shape=tensor_shape.TensorShape((batch_size,) + input_shape))
+ self.assertTrue(model.weights, ('Model should have weights now that it '
+ 'has been properly built.'))
+ self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
+ model(array_ops.ones((32,) + input_shape))
+
+ def test_subclass_save_model(self):
+ num_classes = 10
+ # Input size, e.g. image
+ batch_size = None
+ input_shape = (32, 32, 3)
+
+ model = SimpleConvTestModel(num_classes)
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ model.build(
+ input_shape=tensor_shape.TensorShape((batch_size,) + input_shape))
+ self.assertTrue(model.weights, ('Model should have weights now that it '
+ 'has been properly built.'))
+ self.assertTrue(model.built, 'Model should be built after calling `build`.')
+ weights = model.get_weights()
+
+ tf_format_name = os.path.join(self.get_temp_dir(), 'ckpt')
+ model.save_weights(tf_format_name)
+ if h5py is not None:
+ hdf5_format_name = os.path.join(self.get_temp_dir(), 'weights.h5')
+ model.save_weights(hdf5_format_name)
+
+ model = SimpleConvTestModel(num_classes)
+ model.build(
+ input_shape=tensor_shape.TensorShape((batch_size,) + input_shape))
+ if h5py is not None:
+ model.load_weights(hdf5_format_name)
+ self.assertAllClose(weights, model.get_weights())
+ model.load_weights(tf_format_name)
+ self.assertAllClose(weights, model.get_weights())
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_multi_io_subclass_build(self):
+ batch_size = None
+ num_samples = 1000
+ input_dim = 50
+ model = MultiIOTestModel()
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ batch_input_shape = tensor_shape.TensorShape((batch_size, input_dim))
+ model.build(
+ input_shape=[batch_input_shape, batch_input_shape])
+ self.assertTrue(model.weights, ('Model should have weights now that it '
+ 'has been properly built.'))
+ self.assertTrue(model.built, 'Model should be built after calling `build`.')
+ x1 = array_ops.ones((num_samples, input_dim))
+ x2 = array_ops.ones((num_samples, input_dim))
+ model([x1, x2])
+
+ @test_util.run_in_graph_and_eager_modes
def test_single_io_workflow_with_np_arrays(self):
num_classes = 2
num_samples = 100
@@ -750,6 +972,16 @@ class CustomCallModel(keras.Model):
return combined
+class TrainingNoDefaultModel(keras.Model):
+
+ def __init__(self):
+ super(TrainingNoDefaultModel, self).__init__()
+ self.dense1 = keras.layers.Dense(1)
+
+ def call(self, x, training):
+ return self.dense1(x)
+
+
class CustomCallSignatureTests(test.TestCase):
@test_util.run_in_graph_and_eager_modes
@@ -767,6 +999,32 @@ class CustomCallSignatureTests(test.TestCase):
self.assertAllClose(expected_output, self.evaluate(output))
@test_util.run_in_graph_and_eager_modes
+ def test_training_args_call_build(self):
+ input_dim = 2
+
+ model = TrainingNoDefaultModel()
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ model.build((None, input_dim))
+ self.assertTrue(model.weights, ('Model should have weights now that it '
+ 'has been properly built.'))
+ self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
+ @test_util.run_in_graph_and_eager_modes
+ def test_custom_call_kwargs_and_build(self):
+ first_input_shape = (2, 3)
+ second_input_shape = (2, 5)
+
+ model = CustomCallModel()
+ self.assertFalse(model.built, 'Model should not have been built')
+ self.assertFalse(model.weights, ('Model should have no weights since it '
+ 'has not been built.'))
+ with self.assertRaisesRegexp(
+ ValueError, 'cannot build your model if it has positional'):
+ model.build(input_shape=[first_input_shape, second_input_shape])
+
+ @test_util.run_in_graph_and_eager_modes
def test_inputs_in_signature(self):
class HasInputsAndOtherPositional(keras.Model):
@@ -829,14 +1087,9 @@ class CustomCallSignatureTests(test.TestCase):
def test_training_no_default(self):
- class TrainingNoDefault(keras.Model):
-
- def call(self, x, training):
- return x
-
with context.graph_mode():
- model = TrainingNoDefault()
- arg = array_ops.ones([])
+ model = TrainingNoDefaultModel()
+ arg = array_ops.ones([1, 1])
model(arg, True)
six.assertCountEqual(self, [arg], model.inputs)
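
Note: the new tests above exercise Model.build(input_shape) for subclassed models, which creates weights from a shape alone, before any data is seen. A minimal usage sketch (TinyModel is an illustrative stand-in for the test models):

    from tensorflow.python import keras
    from tensorflow.python.framework import tensor_shape

    class TinyModel(keras.Model):

      def __init__(self):
        super(TinyModel, self).__init__()
        self.dense = keras.layers.Dense(4)

      def call(self, x):
        return self.dense(x)

    model = TinyModel()
    assert not model.built and not model.weights
    model.build(input_shape=tensor_shape.TensorShape((None, 8)))
    assert model.built and model.weights  # variables exist before any call
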
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 17aba7d86c..6e8ee06ff5 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-from collections import OrderedDict
import numpy as np
from tensorflow.python import keras
@@ -185,75 +184,3 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
# for further checks in the caller function
return actual_output
-
-def _combine_named_parameters(**kwargs):
- """Generate combinations based on its keyword arguments.
-
- Two sets of returned combinations can be concatenated using +. Their product
- can be computed using `times()`.
-
- Args:
- **kwargs: keyword arguments of form `option=[possibilities, ...]`
- or `option=the_only_possibility`.
-
- Returns:
- a list of dictionaries for each combination. Keys in the dictionaries are
- the keyword argument names. Each key has one value - one of the
- corresponding keyword argument values.
- """
- if not kwargs:
- return [OrderedDict()]
-
- sort_by_key = lambda k: k[0][0]
- kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
- first = list(kwargs.items())[0]
-
- rest = dict(list(kwargs.items())[1:])
- rest_combined = _combine_named_parameters(**rest)
-
- key = first[0]
- values = first[1]
- if not isinstance(values, list):
- values = [values]
-
- combinations = [
- OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key))
- for v in values
- for combined in rest_combined
- ]
- return combinations
-
-
-def generate_combinations_with_testcase_name(**kwargs):
- """Generate combinations based on its keyword arguments using combine().
-
- This function calls combine() and appends a testcase name to the list of
- dictionaries returned. The 'testcase_name' key is a required for named
- parameterized tests.
-
- Args:
- **kwargs: keyword arguments of form `option=[possibilities, ...]`
- or `option=the_only_possibility`.
-
- Returns:
- a list of dictionaries for each combination. Keys in the dictionaries are
- the keyword argument names. Each key has one value - one of the
- corresponding keyword argument values.
- """
- combinations = _combine_named_parameters(**kwargs)
- named_combinations = []
- for combination in combinations:
- assert isinstance(combination, OrderedDict)
- name = ''.join([
- '_{}_{}'.format(
- ''.join(filter(str.isalnum, key)),
- ''.join(filter(str.isalnum, str(value))))
- for key, value in combination.items()
- ])
- named_combinations.append(
- OrderedDict(
- list(combination.items()) + [('testcase_name',
- '_test{}'.format(name))]))
-
- return named_combinations
-
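
Note: the helpers removed above now live in test_util, which is why the cudnn_recurrent_test hunk earlier switches its call sites. For reference, the generated names filter keys and values down to alphanumerics and join them under a '_test' prefix:

    from tensorflow.python.framework import test_util

    combos = test_util.generate_combinations_with_testcase_name(
        rnn_type=['LSTM'], to_cudnn=[True, False])
    print([c['testcase_name'] for c in combos])
    # ['_test_rnntype_LSTM_tocudnn_True', '_test_rnntype_LSTM_tocudnn_False']
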
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 838cf836f1..adf97569ab 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2845,7 +2845,6 @@ cuda_py_test(
"//tensorflow/python:math_ops",
],
shard_count = 20,
- tags = ["nomsan"], # TODO(b/110990716) reenable
)
cuda_py_test(
@@ -3095,7 +3094,7 @@ tf_py_test(
tf_py_test(
name = "cond_v2_test",
- size = "small",
+ size = "medium",
srcs = ["cond_v2_test.py"],
additional_deps = [
"//tensorflow/python:array_ops",
@@ -3110,4 +3109,5 @@ tf_py_test(
"//tensorflow/python:training",
],
grpc_enabled = True,
+ tags = ["no_gpu"], # TODO(b/111656070)
)
diff --git a/tensorflow/python/kernel_tests/argmax_op_test.py b/tensorflow/python/kernel_tests/argmax_op_test.py
index ce06769902..1202c463e8 100644
--- a/tensorflow/python/kernel_tests/argmax_op_test.py
+++ b/tensorflow/python/kernel_tests/argmax_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
import numpy as np
from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import test
@@ -115,6 +116,12 @@ class ArgMaxTest(test.TestCase):
ans = op([1]).eval()
self.assertAllEqual(ans, 0)
+ def testOutputEmpty(self):
+ with self.test_session():
+ for op in math_ops.argmin, math_ops.argmax:
+ ret = op(array_ops.zeros(shape=[1, 0, 2]), axis=-1).eval()
+ self.assertEqual(ret.shape, (1, 0))
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 759db5d5f4..97ce245fc8 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -22,6 +22,7 @@ from __future__ import print_function
from tensorflow.core.protobuf import config_pb2
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import cond_v2
@@ -35,10 +36,12 @@ from tensorflow.python.training import saver
from tensorflow.python.util import compat
-class NewCondTest(test.TestCase):
+class CondV2Test(test.TestCase):
- def _testCond(self, true_fn, false_fn, train_vals):
- with self.test_session() as sess:
+ def _testCond(self, true_fn, false_fn, train_vals, feed_dict=None):
+ if not feed_dict:
+ feed_dict = {}
+ with self.test_session(graph=ops.get_default_graph()) as sess:
pred = array_ops.placeholder(dtypes.bool, name="pred")
expected = control_flow_ops.cond(pred, true_fn, false_fn, name="expected")
@@ -47,13 +50,17 @@ class NewCondTest(test.TestCase):
expected_grad = gradients_impl.gradients(expected, train_vals)
actual_grad = gradients_impl.gradients(actual, train_vals)
+ sess_run_args = {pred: True}
+ sess_run_args.update(feed_dict)
expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run(
- (expected, actual, expected_grad, actual_grad), {pred: True})
+ (expected, actual, expected_grad, actual_grad), sess_run_args)
self.assertEqual(expected_val, actual_val)
self.assertEqual(expected_grad_val, actual_grad_val)
+ sess_run_args = {pred: False}
+ sess_run_args.update(feed_dict)
expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run(
- (expected, actual, expected_grad, actual_grad), {pred: False})
+ (expected, actual, expected_grad, actual_grad), sess_run_args)
self.assertEqual(expected_val, actual_val)
self.assertEqual(expected_grad_val, actual_grad_val)
@@ -131,6 +138,349 @@ class NewCondTest(test.TestCase):
self.assertIn("foo_cond_1_true", ops.get_default_graph()._functions)
self.assertIn("foo_cond_1_false", ops.get_default_graph()._functions)
+ def testDefunInCond(self):
+ x = constant_op.constant(1.0, name="x")
+ y = constant_op.constant(2.0, name="y")
+
+ def true_fn():
+
+ @function.Defun()
+ def fn():
+ return x * y * 2.0
+
+ return fn()
+
+ def false_fn():
+ return 2.0
+
+ self._testCond(true_fn, false_fn, [x])
+ self._testCond(true_fn, false_fn, [x, y])
+ self._testCond(true_fn, false_fn, [y])
+
+ def testNestedDefunInCond(self):
+ x = constant_op.constant(1.0, name="x")
+ y = constant_op.constant(2.0, name="y")
+
+ def true_fn():
+ return 2.0
+
+ def false_fn():
+
+ @function.Defun()
+ def fn():
+
+ @function.Defun()
+ def nested_fn():
+ return x * y * 2.0
+
+ return nested_fn()
+
+ return fn()
+
+ self._testCond(true_fn, false_fn, [x])
+ self._testCond(true_fn, false_fn, [x, y])
+ self._testCond(true_fn, false_fn, [y])
+
+ def testDoubleNestedDefunInCond(self):
+ x = constant_op.constant(1.0, name="x")
+ y = constant_op.constant(2.0, name="y")
+
+ def true_fn():
+
+ @function.Defun()
+ def fn():
+
+ @function.Defun()
+ def nested_fn():
+
+ @function.Defun()
+ def nested_nested_fn():
+ return x * y * 2.0
+
+ return nested_nested_fn()
+
+ return nested_fn()
+
+ return fn()
+
+ def false_fn():
+ return 2.0
+
+ self._testCond(true_fn, false_fn, [x])
+ self._testCond(true_fn, false_fn, [x, y])
+ self._testCond(true_fn, false_fn, [y])
+
+ def testNestedCond(self):
+
+ def run_test(pred_value):
+
+ def build_graph():
+ pred = array_ops.placeholder(dtypes.bool, name="pred")
+ x = constant_op.constant(1.0, name="x")
+ y = constant_op.constant(2.0, name="y")
+
+ def true_fn():
+ return 2.0
+
+ def false_fn():
+
+ def false_true_fn():
+ return x * y * 2.0
+
+ def false_false_fn():
+ return x * 5.0
+
+ return _cond(pred, false_true_fn, false_false_fn, "inside_false_fn")
+
+ return x, y, pred, true_fn, false_fn
+
+ with ops.Graph().as_default():
+ x, y, pred, true_fn, false_fn = build_graph()
+ self._testCond(true_fn, false_fn, [x, y], {pred: pred_value})
+ self._testCond(true_fn, false_fn, [x], {pred: pred_value})
+ self._testCond(true_fn, false_fn, [y], {pred: pred_value})
+
+ run_test(True)
+ run_test(False)
+
+ def testDoubleNestedCond(self):
+
+ def run_test(pred1_value, pred2_value):
+
+ def build_graph():
+ pred1 = array_ops.placeholder(dtypes.bool, name="pred1")
+ pred2 = array_ops.placeholder(dtypes.bool, name="pred2")
+ x = constant_op.constant(1.0, name="x")
+ y = constant_op.constant(2.0, name="y")
+
+ def true_fn():
+ return 2.0
+
+ def false_fn():
+
+ def false_true_fn():
+
+ def false_true_true_fn():
+ return x * y * 2.0
+
+ def false_true_false_fn():
+ return x * 10.0
+
+ return _cond(
+ pred1,
+ false_true_true_fn,
+ false_true_false_fn,
+ name="inside_false_true_fn")
+
+ def false_false_fn():
+ return x * 5.0
+
+ return _cond(
+ pred2, false_true_fn, false_false_fn, name="inside_false_fn")
+
+ return x, y, pred1, pred2, true_fn, false_fn
+
+ with ops.Graph().as_default():
+ x, y, pred1, pred2, true_fn, false_fn = build_graph()
+ self._testCond(true_fn, false_fn, [x, y], {
+ pred1: pred1_value,
+ pred2: pred2_value
+ })
+ x, y, pred1, pred2, true_fn, false_fn = build_graph()
+ self._testCond(true_fn, false_fn, [x], {
+ pred1: pred1_value,
+ pred2: pred2_value
+ })
+ x, y, pred1, pred2, true_fn, false_fn = build_graph()
+ self._testCond(true_fn, false_fn, [y], {
+ pred1: pred1_value,
+ pred2: pred2_value
+ })
+
+ run_test(True, True)
+ run_test(True, False)
+ run_test(False, False)
+ run_test(False, True)
+
+ def testGradientFromInsideDefun(self):
+
+ def build_graph():
+ pred_outer = array_ops.placeholder(dtypes.bool, name="pred_outer")
+ pred_inner = array_ops.placeholder(dtypes.bool, name="pred_inner")
+ x = constant_op.constant(1.0, name="x")
+ y = constant_op.constant(2.0, name="y")
+
+ def true_fn():
+ return 2.0
+
+ def false_fn():
+
+ def inner_true_fn():
+ return x * y * 2.0
+
+ def inner_false_fn():
+ return x * 5.0
+
+ return cond_v2.cond_v2(
+ pred_inner, inner_true_fn, inner_false_fn, name="inner_cond")
+
+ cond_outer = cond_v2.cond_v2(
+ pred_outer, true_fn, false_fn, name="outer_cond")
+
+ # Compute grads inside a Defun.
+ @function.Defun()
+ def nesting_fn():
+ return gradients_impl.gradients(cond_outer, [x, y])
+
+ grads = nesting_fn()
+
+ return grads, pred_outer, pred_inner
+
+ with ops.Graph().as_default():
+ grads, pred_outer, pred_inner = build_graph()
+ with self.test_session(graph=ops.get_default_graph()) as sess:
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: True,
+ pred_inner: True
+ }), [0., 0.])
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: True,
+ pred_inner: False
+ }), [0., 0.])
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: False,
+ pred_inner: True
+ }), [4., 2.])
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: False,
+ pred_inner: False
+ }), [5., 0.])
+
+ def testGradientFromInsideNestedDefun(self):
+
+ def build_graph():
+ pred_outer = array_ops.placeholder(dtypes.bool, name="pred_outer")
+ pred_inner = array_ops.placeholder(dtypes.bool, name="pred_inner")
+ x = constant_op.constant(1.0, name="x")
+ y = constant_op.constant(2.0, name="y")
+
+ def true_fn():
+ return 2.0
+
+ def false_fn():
+
+ def inner_true_fn():
+ return x * y * 2.0
+
+ def inner_false_fn():
+ return x * 5.0
+
+ return cond_v2.cond_v2(
+ pred_inner, inner_true_fn, inner_false_fn, name="inner_cond")
+
+ cond_outer = cond_v2.cond_v2(
+ pred_outer, true_fn, false_fn, name="outer_cond")
+
+ # Compute grads inside a Defun.
+ @function.Defun()
+ def nesting_fn():
+
+ @function.Defun()
+ def inner_nesting_fn():
+ return gradients_impl.gradients(cond_outer, [x, y])
+
+ return inner_nesting_fn()
+
+ grads = nesting_fn()
+
+ return grads, pred_outer, pred_inner
+
+ with ops.Graph().as_default():
+ grads, pred_outer, pred_inner = build_graph()
+ with self.test_session(graph=ops.get_default_graph()) as sess:
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: True,
+ pred_inner: True
+ }), [0., 0.])
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: True,
+ pred_inner: False
+ }), [0., 0.])
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: False,
+ pred_inner: True
+ }), [4., 2.])
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: False,
+ pred_inner: False
+ }), [5., 0.])
+
+ def testBuildCondAndGradientInsideDefun(self):
+
+ def build_graph():
+ pred_outer = array_ops.placeholder(dtypes.bool, name="pred_outer")
+ pred_inner = array_ops.placeholder(dtypes.bool, name="pred_inner")
+ x = constant_op.constant(1.0, name="x")
+ y = constant_op.constant(2.0, name="y")
+
+ # Build cond and its gradient inside a Defun.
+ @function.Defun()
+ def fn():
+
+ def true_fn():
+ return 2.0
+
+ def false_fn():
+
+ def inner_true_fn():
+ return x * y * 2.0
+
+ def inner_false_fn():
+ return x * 5.0
+
+ return cond_v2.cond_v2(
+ pred_inner, inner_true_fn, inner_false_fn, name="inner_cond")
+
+ cond_outer = cond_v2.cond_v2(
+ pred_outer, true_fn, false_fn, name="outer_cond")
+ return gradients_impl.gradients(cond_outer, [x, y])
+
+ grads = fn()
+
+ return grads, pred_outer, pred_inner
+
+ with ops.Graph().as_default():
+ grads, pred_outer, pred_inner = build_graph()
+ with self.test_session(graph=ops.get_default_graph()) as sess:
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: True,
+ pred_inner: True
+ }), [0., 0.])
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: True,
+ pred_inner: False
+ }), [0., 0.])
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: False,
+ pred_inner: True
+ }), [4., 2.])
+ self.assertSequenceEqual(
+ sess.run(grads, {
+ pred_outer: False,
+ pred_inner: False
+ }), [5., 0.])
+
def testSecondDerivative(self):
with self.test_session() as sess:
pred = array_ops.placeholder(dtypes.bool, name="pred")
@@ -532,5 +882,17 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
self.assertTrue(len(run_metadata.partition_graphs) >= 2)
+def _cond(pred, true_fn, false_fn, name):
+ if _is_old_cond():
+ return control_flow_ops.cond(pred, true_fn, false_fn, name=name)
+ else:
+ return cond_v2.cond_v2(pred, true_fn, false_fn, name=name)
+
+
+def _is_old_cond():
+ return isinstance(ops.get_default_graph()._get_control_flow_context(),
+ control_flow_ops.CondContext)
+
+
if __name__ == "__main__":
test.main()
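
Note: for orientation, cond_v2 is a functional stand-in for control_flow_ops.cond in which each branch is captured as a function and gradients are derived from those branch functions, which is why the tests above nest Defuns so heavily. A minimal graph-mode sketch (values assumed for illustration):

    from tensorflow.python.client import session
    from tensorflow.python.framework import constant_op
    from tensorflow.python.framework import dtypes
    from tensorflow.python.framework import ops
    from tensorflow.python.ops import array_ops
    from tensorflow.python.ops import cond_v2
    from tensorflow.python.ops import gradients_impl

    with ops.Graph().as_default():
      pred = array_ops.placeholder(dtypes.bool, name='pred')
      x = constant_op.constant(3.0, name='x')
      out = cond_v2.cond_v2(pred, lambda: x * x, lambda: x, name='sketch_cond')
      dx, = gradients_impl.gradients(out, [x])
      with session.Session() as sess:
        print(sess.run([out, dx], {pred: True}))   # [9.0, 6.0]
        print(sess.run([out, dx], {pred: False}))  # [3.0, 1.0]
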
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 7134e02c34..58845552db 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -90,7 +90,7 @@ def CheckGradConfigsToTest():
class DepthwiseConv2DTest(test.TestCase):
# This is testing that depthwise_conv2d and depthwise_conv2d_native
- # produce the same results. It also tests that NCHW and NWHC
+ # produce the same results. It also tests that NCHW and NHWC
# formats agree, by comparing the depthwise_conv2d_native with
# 'NCHW' format (with transposition) matches the 'NHWC' format using
# the higher level interface.
@@ -142,7 +142,7 @@ class DepthwiseConv2DTest(test.TestCase):
native_t1 = t1
strides = [1, stride, stride, 1]
if data_format == "NCHW":
- # Transpose from NWHC input to NCHW
+ # Transpose from NHWC input to NCHW
# Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
native_t1 = array_ops.transpose(t1, [0, 3, 1, 2])
strides = [1, 1, stride, stride]
@@ -368,7 +368,7 @@ class DepthwiseConv2DTest(test.TestCase):
native_input = input_tensor
strides = [1, stride, stride, 1]
if data_format == "NCHW":
- # Transpose from NWHC input to NCHW
+ # Transpose from NHWC input to NCHW
# Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
native_input = array_ops.transpose(input_tensor, [0, 3, 1, 2])
input_shape = [
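
Note: the comment corrections above are cosmetic; the code really does transpose NHWC to NCHW with perm=[0, 3, 1, 2]. A quick standalone check of that permutation, using the shapes from the in-code example:

    import numpy as np

    x = np.zeros((4, 5, 5, 48))                 # NHWC
    print(np.transpose(x, (0, 3, 1, 2)).shape)  # (4, 48, 5, 5), i.e. NCHW
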
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index 9d38ffcb4a..61faa8466e 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -311,8 +311,10 @@ class EmbedCheckCategoricalEventShapeTest(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testUnsupportedDtype(self):
with self.test_session():
+ param = ops.convert_to_tensor(
+ np.ones([2**11 + 1]).astype(dtypes.qint16.as_numpy_dtype),
+ dtype=dtypes.qint16)
with self.assertRaises(TypeError):
- param = array_ops.ones([int(2**11+1)], dtype=dtypes.qint16)
du.embed_check_categorical_event_shape(param)
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 58e2a8ac2a..c0b419e1d1 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -203,8 +203,7 @@ class GatherNdTest(test.TestCase):
indices = [[[0], [7]]] # Make this one higher rank
gather_nd = array_ops.gather_nd(params, indices)
with self.assertRaisesOpError(
- r"flat indices\[1, :\] = \[7\] does not index into param "
- r"\(shape: \[3\]\)"):
+ r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
gather_nd.eval()
def _disabledTestBadIndicesGPU(self):
@@ -217,8 +216,7 @@ class GatherNdTest(test.TestCase):
indices = [[[0], [7]]] # Make this one higher rank
gather_nd = array_ops.gather_nd(params, indices)
with self.assertRaisesOpError(
- r"flat indices\[1, :\] = \[7\] does not index into param "
- r"\(shape: \[3\]\)"):
+ r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
gather_nd.eval()
def testBadIndicesWithSlicesCPU(self):
@@ -227,8 +225,7 @@ class GatherNdTest(test.TestCase):
indices = [[[0], [0], [1]]] # Make this one higher rank
gather_nd = array_ops.gather_nd(params, indices)
with self.assertRaisesOpError(
- r"flat indices\[2, :\] = \[1\] does not index into param "
- r"\(shape: \[1,3\]\)"):
+ r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
gather_nd.eval()
def _disabledTestBadIndicesWithSlicesGPU(self):
@@ -241,8 +238,7 @@ class GatherNdTest(test.TestCase):
indices = [[[0], [0], [1]]] # Make this one higher rank
gather_nd = array_ops.gather_nd(params, indices)
with self.assertRaisesOpError(
- r"flat indices\[2, :\] = \[1\] does not index into param "
- r"\(shape: \[1,3\]\)"):
+ r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
gather_nd.eval()
def testGradientsRank2Elements(self):
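
Note: the new error text reports a position within indices rather than a flattened row, matching how gather_nd consumes them: the trailing dimension of indices selects into params and the leading dimensions are batch positions, so indices[0,1] = [7] is the out-of-range entry. A small valid-case sketch:

    import numpy as np
    from tensorflow.python.client import session
    from tensorflow.python.ops import array_ops

    params = np.array([10., 20., 30.])  # shape [3]
    indices = [[[0], [2]]]              # shape [1, 2, 1]; indices[0,1] = [2]
    with session.Session() as sess:
      print(sess.run(array_ops.gather_nd(params, indices)))  # [[10. 30.]]
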
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 487418e694..f4ec3e3996 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -234,3 +234,21 @@ cuda_py_test(
"optonly",
],
)
+
+cuda_py_test(
+ name = "linear_operator_zeros_test",
+ size = "medium",
+ srcs = ["linear_operator_zeros_test.py"],
+ additional_deps = [
+ "//tensorflow/python/ops/linalg",
+ "//tensorflow/python:array_ops",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:framework",
+ "//tensorflow/python:framework_test_lib",
+ "//tensorflow/python:linalg_ops",
+ "//tensorflow/python:platform_test",
+ "//tensorflow/python:random_ops",
+ ],
+ shard_count = 5,
+ tags = ["optonly"], # Test is flaky without optimization.
+)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
new file mode 100644
index 0000000000..8f60b55e0a
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -0,0 +1,192 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+
+random_seed.set_random_seed(23)
+rng = np.random.RandomState(2016)
+
+
+class LinearOperatorZerosTest(
+ linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+ """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+ @property
+ def _tests_to_skip(self):
+ return ["log_abs_det", "solve", "solve_with_broadcast"]
+
+ @property
+ def _operator_build_infos(self):
+ build_info = linear_operator_test_util.OperatorBuildInfo
+ return [
+ build_info((1, 1)),
+ build_info((1, 3, 3)),
+ build_info((3, 4, 4)),
+ build_info((2, 1, 4, 4))]
+
+ def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+ del use_placeholder
+ shape = list(build_info.shape)
+ assert shape[-1] == shape[-2]
+
+ batch_shape = shape[:-2]
+ num_rows = shape[-1]
+
+ operator = linalg_lib.LinearOperatorZeros(
+ num_rows, batch_shape=batch_shape, dtype=dtype)
+ matrix = array_ops.zeros(shape=shape, dtype=dtype)
+
+ return operator, matrix
+
+ def test_assert_positive_definite(self):
+ operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+ with self.assertRaisesOpError("non-positive definite"):
+ operator.assert_positive_definite()
+
+ def test_assert_non_singular(self):
+ with self.assertRaisesOpError("non-invertible"):
+ operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+ operator.assert_non_singular()
+
+ def test_assert_self_adjoint(self):
+ with self.test_session():
+ operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+ operator.assert_self_adjoint().run() # Should not fail
+
+ def test_non_scalar_num_rows_raises_static(self):
+ with self.assertRaisesRegexp(ValueError, "must be a 0-D Tensor"):
+ linalg_lib.LinearOperatorZeros(num_rows=[2])
+ with self.assertRaisesRegexp(ValueError, "must be a 0-D Tensor"):
+ linalg_lib.LinearOperatorZeros(num_rows=2, num_columns=[2])
+
+ def test_non_integer_num_rows_raises_static(self):
+ with self.assertRaisesRegexp(TypeError, "must be integer"):
+ linalg_lib.LinearOperatorZeros(num_rows=2.)
+ with self.assertRaisesRegexp(TypeError, "must be integer"):
+ linalg_lib.LinearOperatorZeros(num_rows=2, num_columns=2.)
+
+ def test_negative_num_rows_raises_static(self):
+ with self.assertRaisesRegexp(ValueError, "must be non-negative"):
+ linalg_lib.LinearOperatorZeros(num_rows=-2)
+ with self.assertRaisesRegexp(ValueError, "must be non-negative"):
+ linalg_lib.LinearOperatorZeros(num_rows=2, num_columns=-2)
+
+ def test_non_1d_batch_shape_raises_static(self):
+ with self.assertRaisesRegexp(ValueError, "must be a 1-D"):
+ linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=2)
+
+ def test_non_integer_batch_shape_raises_static(self):
+ with self.assertRaisesRegexp(TypeError, "must be integer"):
+ linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=[2.])
+
+ def test_negative_batch_shape_raises_static(self):
+ with self.assertRaisesRegexp(ValueError, "must be non-negative"):
+ linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=[-2])
+
+ def test_non_scalar_num_rows_raises_dynamic(self):
+ with self.test_session():
+ num_rows = array_ops.placeholder(dtypes.int32)
+ operator = linalg_lib.LinearOperatorZeros(
+ num_rows, assert_proper_shapes=True)
+ with self.assertRaisesOpError("must be a 0-D Tensor"):
+ operator.to_dense().eval(feed_dict={num_rows: [2]})
+
+ def test_negative_num_rows_raises_dynamic(self):
+ with self.test_session():
+ n = array_ops.placeholder(dtypes.int32)
+ operator = linalg_lib.LinearOperatorZeros(
+ num_rows=n, assert_proper_shapes=True)
+ with self.assertRaisesOpError("must be non-negative"):
+ operator.to_dense().eval(feed_dict={n: -2})
+
+ operator = linalg_lib.LinearOperatorZeros(
+ num_rows=2, num_columns=n, assert_proper_shapes=True)
+ with self.assertRaisesOpError("must be non-negative"):
+ operator.to_dense().eval(feed_dict={n: -2})
+
+ def test_non_1d_batch_shape_raises_dynamic(self):
+ with self.test_session():
+ batch_shape = array_ops.placeholder(dtypes.int32)
+ operator = linalg_lib.LinearOperatorZeros(
+ num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True)
+ with self.assertRaisesOpError("must be a 1-D"):
+ operator.to_dense().eval(feed_dict={batch_shape: 2})
+
+ def test_negative_batch_shape_raises_dynamic(self):
+ with self.test_session():
+ batch_shape = array_ops.placeholder(dtypes.int32)
+ operator = linalg_lib.LinearOperatorZeros(
+ num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True)
+ with self.assertRaisesOpError("must be non-negative"):
+ operator.to_dense().eval(feed_dict={batch_shape: [-2]})
+
+ def test_wrong_matrix_dimensions_raises_static(self):
+ operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+ x = rng.randn(3, 3).astype(np.float32)
+ with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
+ operator.matmul(x)
+
+ def test_wrong_matrix_dimensions_raises_dynamic(self):
+ num_rows = array_ops.placeholder(dtypes.int32)
+ x = array_ops.placeholder(dtypes.float32)
+
+ with self.test_session():
+ operator = linalg_lib.LinearOperatorZeros(
+ num_rows, assert_proper_shapes=True)
+ y = operator.matmul(x)
+ with self.assertRaisesOpError("Incompatible.*dimensions"):
+ y.eval(feed_dict={num_rows: 2, x: rng.rand(3, 3)})
+
+ def test_is_x_flags(self):
+    # Defaults: is_self_adjoint=True, is_non_singular=False,
+    # is_positive_definite=False.
+ operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+ self.assertFalse(operator.is_positive_definite)
+ self.assertFalse(operator.is_non_singular)
+ self.assertTrue(operator.is_self_adjoint)
+
+
+class LinearOperatorZerosNotSquareTest(
+ linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
+
+ def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+ del use_placeholder
+ shape = list(build_info.shape)
+
+ batch_shape = shape[:-2]
+ num_rows = shape[-2]
+ num_columns = shape[-1]
+
+ operator = linalg_lib.LinearOperatorZeros(
+ num_rows, num_columns, is_square=False, is_self_adjoint=False,
+ batch_shape=batch_shape, dtype=dtype)
+ matrix = array_ops.zeros(shape=shape, dtype=dtype)
+
+ return operator, matrix
+
+
+if __name__ == "__main__":
+ test.main()
diff --git a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
index d8ce9fffbd..3cbbd48c8c 100644
--- a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
@@ -82,7 +82,7 @@ def CheckGradConfigsToTest():
class DepthwiseConv2DTest(test.TestCase):
# This is testing that depthwise_conv2d and depthwise_conv2d_native
- # produce the same results. It also tests that NCHW and NWHC
+ # produce the same results. It also tests that NCHW and NHWC
# formats agree, by checking that depthwise_conv2d_native with
# 'NCHW' format (with transposition) matches the 'NHWC' format using
# the higher level interface.
@@ -123,7 +123,7 @@ class DepthwiseConv2DTest(test.TestCase):
native_t1 = t1
strides = [1, stride, stride, 1]
if data_format == "NCHW":
- # Transpose from NWHC input to NCHW
+ # Transpose from NHWC input to NCHW
# Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
native_t1 = array_ops.transpose(t1, [0, 3, 1, 2])
strides = [1, 1, stride, stride]
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 080319f6e8..f2f3023469 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -268,12 +268,12 @@ class StatefulScatterNdTest(test.TestCase):
# Test some out of range errors.
indices = np.array([[-1], [0], [5]])
with self.assertRaisesOpError(
- r"Invalid indices: \[0,0\] = \[-1\] does not index into \[6\]"):
+ r"indices\[0\] = \[-1\] does not index into shape \[6\]"):
op(ref, indices, updates).eval()
indices = np.array([[2], [0], [6]])
with self.assertRaisesOpError(
- r"Invalid indices: \[2,0\] = \[6\] does not index into \[6\]"):
+ r"indices\[2\] = \[6\] does not index into shape \[6\]"):
op(ref, indices, updates).eval()
def testRank3ValidShape(self):
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index fe459a96b9..a2b5f77f91 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -790,7 +790,7 @@ def _ExtractImagePatchesGrad(op, grad):
sp_mat = sparse_tensor.SparseTensor(
array_ops.constant(idx, dtype=ops.dtypes.int64),
- array_ops.ones((len(idx),), dtype=ops.dtypes.float32), sp_shape)
+ array_ops.ones((len(idx),), dtype=grad.dtype), sp_shape)
jac = sparse_ops.sparse_tensor_dense_matmul(sp_mat, grad_flat)
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 868a4f6b84..f7cbfe0312 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -37,8 +37,19 @@ from tensorflow.python.training import saver
class PruningMode(object):
+ """Class for working with Pruning modes."""
NO_PRUNING, PRE_PRUNING, POST_PRUNING = range(0, 3)
+ _map = {'none': NO_PRUNING, 'pre': PRE_PRUNING, 'post': POST_PRUNING}
+
+ @classmethod
+ def from_str(cls, mode):
+ if mode in cls._map:
+ return cls._map[mode]
+ else:
+      raise ValueError('pruning_mode must be one of: {}'.format(', '.join(
+ sorted(cls._map))))
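+
+  # Illustrative usage (added for clarity; not part of the original change):
+  #   PruningMode.from_str('post')  # == PruningMode.POST_PRUNING == 2
+  #   PruningMode.from_str('full')  # raises ValueError: pruning_mode must be
+  #                                 # one of: none, post, pre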
+
class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
"""SaveableObject implementation for TreeEnsemble."""
diff --git a/tensorflow/python/ops/cond_v2_impl.py b/tensorflow/python/ops/cond_v2_impl.py
index d310f83dca..44c5c050c0 100644
--- a/tensorflow/python/ops/cond_v2_impl.py
+++ b/tensorflow/python/ops/cond_v2_impl.py
@@ -58,12 +58,14 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
with ops.name_scope(name) as scope:
# Identify if there is a caller device, & get the innermost if possible.
- device_stack = ops.get_default_graph()._device_function_stack
- caller_device = device_stack[-1] if device_stack else None
+ # pylint: disable=protected-access
+ device_funcs = ops.get_default_graph()._device_functions_outer_to_inner
+ caller_device = device_funcs[-1] if device_funcs else None
caller_colocation_stack = ops.get_default_graph()._colocation_stack
caller_container = ops.get_default_graph()._container
caller_collection_ref = ops.get_default_graph()._collections
+ # pylint: enable=protected-access
func_name_prefix = scope.replace("/", "_")
@@ -106,7 +108,7 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
false_graph.outputs.extend(extra_false_outputs)
# Create the If op.
- tensors = gen_functional_ops._if(
+ tensors = gen_functional_ops._if( # pylint: disable=protected-access
pred, cond_inputs, [t.dtype for t in true_graph.outputs],
_create_new_tf_function(true_graph),
_create_new_tf_function(false_graph),
@@ -125,8 +127,10 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
# TODO(b/110167197) this approach requires cond_v2 to have at least 1 output
if_op = tensors[0].op
if not control_flow_util.IsInXLAContext(if_op):
+ # pylint: disable=protected-access
if_op._set_attr("_lower_using_switch_merge",
attr_value_pb2.AttrValue(b=True))
+ # pylint: enable=protected-access
return tensors[:num_cond_outputs]
@@ -135,6 +139,10 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
def _IfGrad(op, *grads): # pylint: disable=invalid-name
"""The gradient of an If op produced by cond_v2."""
true_graph, false_graph = _get_func_graphs(op)
+ # Note: op.graph != ops.get_default_graph() when we are computing the gradient
+ # of a nested cond.
+ assert true_graph._outer_graph == op.graph
+ assert false_graph._outer_graph == op.graph
# Create grad functions that compute the gradient of the true/false forward
# graphs. These functions will capture tensors from the forward pass
@@ -147,15 +155,16 @@ def _IfGrad(op, *grads): # pylint: disable=invalid-name
assert ([t.dtype for t in true_grad_graph.outputs] ==
[t.dtype for t in false_grad_graph.outputs])
- # Match up the captured grad function inputs with outputs of 'op' and other
- # external tensors.
- true_grad_inputs = _get_grad_inputs(op, true_graph, true_grad_graph)
- false_grad_inputs = _get_grad_inputs(op, false_graph, false_grad_graph)
+ # Resolve references to forward graph tensors in grad graphs and ensure
+ # they are in-scope, i.e., belong to one of outer graphs of the grad graph.
+ true_grad_extra_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
+ false_grad_extra_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
# Make the inputs to true_grad_graph and false_grad_graph match. Note that
# this modifies true_grad_graph and false_grad_graph.
grad_inputs = _make_inputs_match(true_grad_graph, false_grad_graph,
- true_grad_inputs, false_grad_inputs)
+ true_grad_extra_inputs,
+ false_grad_extra_inputs)
# Add all intermediate tensors as function outputs so they're available for
# higher-order gradient computations.
@@ -199,11 +208,20 @@ def _get_func_graphs(if_op):
input_shapes = [t.shape for t in extra_inputs]
func_name = if_op.get_attr(branch_name).name
fdef = if_op.graph._get_function(func_name).definition
- func_graph = _function_def_to_graph.function_def_to_graph(
- fdef, input_shapes)
+ # `if_op.graph` may not be the same as `ops.get_default_graph()` e.g.
+ # in the case of nested if ops or when the gradient is being computed
+ # from inside a Defun. We build the `func_graph` with `if_op.graph` as its
+ # `outer_graph`. This resembles how the `_FuncGraph` was built in the
+ # forward pass. We need this so that we can resolve references to tensors
+ # in `func_graph` from its gradient graph in `_resolve_grad_inputs`.
+ with if_op.graph.as_default():
+ func_graph = _function_def_to_graph.function_def_to_graph(
+ fdef, input_shapes)
func_graph.extra_inputs = extra_inputs
func_graph.extra_args = func_graph.inputs
func_graph._captured = dict(zip(extra_inputs, func_graph.inputs))
+ # Set the if op so that the gradient code can use it.
+ func_graph._if = if_op
return func_graph
return (_get_func_graph_for_branch("then_branch"),
@@ -240,7 +258,7 @@ def _grad_fn(func_graph, grads):
# Build the gradient graph. Note that this builds the gradient computation of
# func_graph in the current graph, which requires capturing tensors from
# func_graph. The captured func_graph tensors are resolved to external tensors
- # in _get_grad_inputs.
+ # in _resolve_grad_inputs.
result = _gradients_impl._GradientsHelper(
ys, func_graph.inputs, grad_ys=grad_ys,
src_graph=func_graph)
@@ -261,43 +279,49 @@ def _create_grad_func(func_graph, grads, name):
[], [], name)
-def _get_grad_inputs(if_op, cond_graph, grad_graph):
- """Returns the tensors we should pass to grad_graph.
+def _resolve_grad_inputs(cond_graph, grad_graph):
+ """Returns the tensors to pass as `extra_inputs` to `grad_graph`.
- This method handles tensors captured from cond_graph in grad_graph. It
- converts these to suitable input tensors from the outer graph.
+ The `grad_graph` may have external references to
+ 1. Its outer graph containing the input gradients. These references are kept
+ as is.
+ 2. Tensors in the forward pass graph. These tensors may not be "live"
+ when the gradient is being computed. We replace such references by their
+ corresponding tensor in the least common ancestor graph of `grad_graph` and
+ `cond_graph`. Since we export intermediate tensors for all branch
+ functions, this is always possible.
Args:
- if_op: Operation. The forward-pass If op that uses cond_graph.
cond_graph: function._FuncGraph. The forward-pass function.
grad_graph: function._FuncGraph. The gradients function.
Returns:
A list of input tensors to be passed to grad_graph.
"""
- inputs = []
-
- # Maps placeholders in cond_graph -> input tensor in outer graph.
- forward_input_map = {v: k for k, v in cond_graph._captured.items()}
+ new_extra_inputs = []
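+  # Illustrative walk-through (added commentary, not part of the original
+  # change): a tensor `t` captured from a nested forward graph is mapped to
+  # that graph's matching extra_input if it was itself captured, or to the
+  # corresponding If op output if it was an intermediate, repeating until
+  # `t.graph` is an ancestor of `grad_graph`.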
for t in grad_graph.extra_inputs:
- if t.graph == ops.get_default_graph():
- # t is in the outer graph (e.g. one of the input gradients).
- inputs.append(t)
- elif t in forward_input_map:
- # t is an input placeholder in cond_graph. Get the corresponding input
- # tensor in the outer graph.
- assert t.graph == cond_graph
- assert forward_input_map[t].graph == ops.get_default_graph()
- inputs.append(forward_input_map[t])
- else:
- # t is an intermediate value in cond_graph. Get the corresponding output
- # of 'if_op' (note that all intermediate values are outputs).
- assert t.graph == cond_graph
- output_idx = cond_graph.outputs.index(t)
- inputs.append(if_op.outputs[output_idx])
-
- return inputs
+ if t.graph != grad_graph._outer_graph:
+ # `t` is a tensor in `cond_graph` or one of its ancestors. We bubble this
+ # tensor to the least common ancestor of the `cond_graph` and
+ # `grad_graph` so that it is "in-scope" for `grad_graph`.
+ # TODO(srbs): `_is_ancestor` calls may be expensive. Compute the least
+ # common ancestor once and re-use.
+ assert _is_ancestor(cond_graph, t.graph)
+ while not _is_ancestor(grad_graph, t.graph):
+ assert isinstance(t.graph, _function._FuncGraph)
+ if t in t.graph.extra_args:
+          # TODO(srbs): Consider building a map of extra_args -> extra_inputs
+          # instead of searching for `t` twice.
+ t = t.graph.extra_inputs[t.graph.extra_args.index(t)]
+ else:
+ # Note: All intermediate tensors are output by the If op.
+ # TODO(srbs): .index() calls may be expensive. Optimize.
+ t = t.graph._if.outputs[t.graph.outputs.index(t)]
+ assert _is_ancestor(grad_graph, t.graph)
+ new_extra_inputs.append(t)
+
+ return new_extra_inputs
def _create_new_tf_function(func_graph):
@@ -326,7 +350,8 @@ def _create_new_tf_function(func_graph):
# a new TF_Function that we add to the graph.
fdef = _function.function_def_from_tf_function(c_func)
defined_func = _function._from_definition(fdef)
- defined_func.add_to_graph(ops.get_default_graph())
+ defined_func._sub_functions = func_graph._functions
+ defined_func.add_to_graph(func_graph._outer_graph)
return func_graph.name
@@ -389,7 +414,8 @@ def _pad_params(true_graph, false_graph, true_params, false_params):
return new_true_params, new_false_inputs
-def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
+def _make_inputs_match(true_graph, false_graph, true_extra_inputs,
+ false_extra_inputs):
"""Modifies true_graph and false_graph so they have the same input signature.
This method reorders and/or adds parameters to true_graph and false_graph so
@@ -400,9 +426,9 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
Args:
true_graph: function._FuncGraph
false_graph: function._FuncGraph
- true_inputs: a list of Tensors in the outer graph. The inputs for
+ true_extra_inputs: a list of Tensors in the outer graph. The inputs for
true_graph.
- false_inputs: a list of Tensors in the outer graph. The inputs for
+ false_extra_inputs: a list of Tensors in the outer graph. The inputs for
false_graph.
Returns:
@@ -411,12 +437,12 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
false_inputs.
"""
shared_inputs, true_only_inputs, false_only_inputs = _separate_unique_inputs(
- true_inputs, false_inputs)
+ true_extra_inputs, false_extra_inputs)
new_inputs = shared_inputs + true_only_inputs + false_only_inputs
- true_input_to_param = dict(zip(true_inputs, true_graph.inputs))
- false_input_to_param = dict(zip(false_inputs, false_graph.inputs))
+ true_input_to_param = dict(zip(true_extra_inputs, true_graph.inputs))
+ false_input_to_param = dict(zip(false_extra_inputs, false_graph.inputs))
true_graph.inputs = (
[true_input_to_param[t] for t in shared_inputs] +
@@ -432,6 +458,9 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
true_graph.extra_inputs = new_inputs
false_graph.extra_inputs = new_inputs
+ true_graph.extra_args = true_graph.inputs
+ false_graph.extra_args = false_graph.inputs
+
true_graph._captured = dict(zip(new_inputs, true_graph.inputs))
false_graph._captured = dict(zip(new_inputs, false_graph.inputs))
@@ -454,14 +483,30 @@ def _create_dummy_params(func_graph, template_tensors):
def _get_grad_fn_name(func_graph):
- """Returns a unique name to use for the grad function of `func_graph`."""
+ """Returns a unique name to use for the grad function of `func_graph`.
+
+ Ensures this name is unique in the entire hierarchy.
+
+ Args:
+ func_graph: The _FuncGraph.
+
+ Returns:
+ A string, the name to use for the gradient function.
+ """
name = "%s_grad" % func_graph.name
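+  # For example (illustrative): if "cond_true_grad" already names a function
+  # in any enclosing graph, the loop below returns "cond_true_grad_1", then
+  # "cond_true_grad_2", and so on.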
base_name = name
counter = 1
- if ops.get_default_graph()._is_function(name):
- name = "%s_%s" % (base_name, counter)
- counter += 1
+ has_conflict = True
+ while has_conflict:
+ curr_graph = func_graph._outer_graph
+ has_conflict = curr_graph._is_function(name)
+ while not has_conflict and isinstance(curr_graph, _function._FuncGraph):
+ curr_graph = curr_graph._outer_graph
+ has_conflict = curr_graph._is_function(name)
+ if has_conflict:
+ name = "%s_%s" % (base_name, counter)
+ counter += 1
return name
@@ -477,3 +522,11 @@ def _check_same_outputs(true_graph, false_graph):
"arguments, got:\n"
" true_fn: %s\n"
" false_fn: %s" % (true_output_types, false_output_types))
+
+
+def _is_ancestor(graph, maybe_ancestor):
+ if maybe_ancestor == graph:
+ return True
+ if isinstance(graph, _function._FuncGraph):
+ return _is_ancestor(graph._outer_graph, maybe_ancestor)
+ return False
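+# e.g. for graphs outer -> cond -> cond_grad (each a _FuncGraph nested in the
+# previous), _is_ancestor(cond_grad, outer) is True while
+# _is_ancestor(outer, cond_grad) is False. (Illustrative note, not part of
+# the original change.)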
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 888075ba2e..aeac61c005 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1817,15 +1817,34 @@ class CondContext(ControlFlowContext):
def _AddOpInternal(self, op):
"""Add `op` to the current context."""
if not op.inputs:
- # Remove any external control dependency on this op
+ # If we're in a while loop, remove any control inputs from outside the
+ # loop.
self._RemoveExternalControlEdges(op)
- # pylint: disable=protected-access
- op._add_control_input(self._pivot.op)
- # pylint: enable=protected-access
+
+ if not any(util.OpInContext(input_op, self)
+ for input_op in op.control_inputs):
+ # pylint: disable=protected-access
+ op._add_control_input(self._pivot.op)
+ # pylint: enable=protected-access
else:
+ # Make each input to 'op' available in this CondContext. If an input is
+ # already part of this context there's nothing to do, but if it's
+ # external, AddValue() will handle adding the appropriate Switch node and
+ # other bookkeeping.
for index in range(len(op.inputs)):
x = op.inputs[index]
- real_x = self.AddValue(x)
+ if op.type == "Merge" and x.op.type == "NextIteration":
+ # Edge case: if we're importing a while loop inside this CondContext,
+ # AddValue() will not correctly handle the NextIteration inputs to
+ # Merge node. The problem is that the NextIteration should also be
+ # part of this context, but if we're importing it won't have been
+ # processed and added to the context yet, so AddValue() will try to
+ # add a Switch which results in an invalid graph. Instead, we use the
+ # NextIteration input as-is here, and it will eventually be added to
+ # the context via AddOp().
+ real_x = x
+ else:
+ real_x = self.AddValue(x)
if real_x != x:
# pylint: disable=protected-access
op._update_input(index, real_x)
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index 7a18986c5b..72c074ed1a 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -214,6 +214,14 @@ def IsContainingContext(ctxt, maybe_containing_ctxt):
return True
+def OpInContext(op, ctxt):
+ return IsContainingContext(op._get_control_flow_context(), ctxt) # pylint: disable=protected-access
+
+
+def TensorInContext(tensor, ctxt):
+ return OpInContext(tensor.op, ctxt)
+
+
def CheckInputFromValidContext(op, input_op):
"""Returns whether `input_op` can be used from `op`s context.
diff --git a/tensorflow/python/ops/conv2d_benchmark.py b/tensorflow/python/ops/conv2d_benchmark.py
index aacdaa7ad0..28111c2730 100644
--- a/tensorflow/python/ops/conv2d_benchmark.py
+++ b/tensorflow/python/ops/conv2d_benchmark.py
@@ -175,7 +175,8 @@ class Conv2DBenchmark(test.Benchmark):
data_types = [dtypes.float32, dtypes.float16]
data_formats = ["NHWC", "NCHW"]
- in_channels = list(range(3, 16))
+ in_channels = list(range(1, 10)) + list(range(10, 20, 2)) + list(
+ range(20, 33, 4))
out_channels = [4, 16, 32]
hw_strides = [[2, 2]]
paddings = ["VALID", "SAME"]
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index a7ba0bbe9c..c29b5033bb 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops.linalg.linear_operator_identity import *
from tensorflow.python.ops.linalg.linear_operator_kronecker import *
from tensorflow.python.ops.linalg.linear_operator_low_rank_update import *
from tensorflow.python.ops.linalg.linear_operator_lower_triangular import *
+from tensorflow.python.ops.linalg.linear_operator_zeros import *
# pylint: enable=wildcard-import
# Seal API.
diff --git a/tensorflow/python/ops/linalg/linear_operator_zeros.py b/tensorflow/python/ops/linalg/linear_operator_zeros.py
new file mode 100644
index 0000000000..b8a79c065b
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_zeros.py
@@ -0,0 +1,452 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`LinearOperator` acting like a zero matrix."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = [
+ "LinearOperatorZeros",
+]
+
+
+@tf_export("linalg.LinearOperatorZeros")
+class LinearOperatorZeros(linear_operator.LinearOperator):
+ """`LinearOperator` acting like a [batch] zero matrix.
+
+ This operator acts like a [batch] zero matrix `A` with shape
+ `[B1,...,Bb, N, M]` for some `b >= 0`. The first `b` indices index a
+ batch member. For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is
+ an `N x M` matrix. This matrix `A` is not materialized, but for
+ purposes of broadcasting this shape will be relevant.
+
+ `LinearOperatorZeros` is initialized with `num_rows`, and optionally
+  `num_columns`, `batch_shape`, and `dtype` arguments. If `num_columns` is
+ `None`, then this operator will be initialized as a square matrix. If
+  `batch_shape` is `None`, this operator adds no batch dimensions, so no
+  broadcast copies of its arguments are needed. If `batch_shape` is provided,
+  broadcasting may occur, which will require making copies.
+
+ ```python
+ # Create a 2 x 2 zero matrix.
+  operator = LinearOperatorZeros(num_rows=2, dtype=tf.float32)
+
+ operator.to_dense()
+ ==> [[0., 0.]
+ [0., 0.]]
+
+ operator.shape
+ ==> [2, 2]
+
+ operator.determinant()
+ ==> 0.
+
+ x = ... Shape [2, 4] Tensor
+ operator.matmul(x)
+  ==> Shape [2, 4] Tensor, same as tf.zeros_like(x).
+
+ # Create a 2-batch of 2x2 zero matrices
+ operator = LinearOperatorZeros(num_rows=2, batch_shape=[2])
+ operator.to_dense()
+ ==> [[[0., 0.]
+ [0., 0.]],
+ [[0., 0.]
+ [0., 0.]]]
+
+  # Here, even though the operator has a batch shape, the output has the same
+  # shape as x, so no broadcast copy is needed. The operator is able to detect
+  # that no broadcast is necessary because both x and the operator have
+  # statically defined shape.
+ x = ... Shape [2, 2, 3]
+ operator.matmul(x)
+ ==> Shape [2, 2, 3] Tensor, same as tf.zeros_like(x)
+
+ # Here the operator and x have different batch_shape, and are broadcast.
+  # This requires a copy, since the output is a different size than the input.
+ x = ... Shape [1, 2, 3]
+ operator.matmul(x)
+ ==> Shape [2, 2, 3] Tensor, equal to tf.zeros_like([x, x])
+ ```
+
+ ### Shape compatibility
+
+  This operator acts on [batch] matrices with compatible shape.
+ `x` is a batch matrix with compatible shape for `matmul` and `solve` if
+
+ ```
+ operator.shape = [B1,...,Bb] + [N, M], with b >= 0
+ x.shape = [C1,...,Cc] + [M, R],
+ and [C1,...,Cc] broadcasts with [B1,...,Bb] to [D1,...,Dd]
+ ```
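+
+  For example (an illustrative case, not spelled out above):
+  `operator.shape = [2, 4, 4]` and `x.shape = [1, 4, 3]` broadcast to a
+  `matmul` result of shape `[2, 4, 3]`.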
+
+ #### Matrix property hints
+
+ This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+ for `X = non_singular, self_adjoint, positive_definite, square`.
+ These have the following meaning:
+
+ * If `is_X == True`, callers should expect the operator to have the
+ property `X`. This is a promise that should be fulfilled, but is *not* a
+ runtime assert. For example, finite floating point precision may result
+ in these promises being violated.
+ * If `is_X == False`, callers should expect the operator to not have `X`.
+ * If `is_X == None` (the default), callers should have no expectation either
+ way.
+ """
+
+ def __init__(self,
+ num_rows,
+ num_columns=None,
+ batch_shape=None,
+ dtype=None,
+ is_non_singular=False,
+ is_self_adjoint=True,
+ is_positive_definite=False,
+ is_square=True,
+ assert_proper_shapes=False,
+ name="LinearOperatorZeros"):
+ r"""Initialize a `LinearOperatorZeros`.
+
+ The `LinearOperatorZeros` is initialized with arguments defining `dtype`
+ and shape.
+
+ This operator is able to broadcast the leading (batch) dimensions, which
+ sometimes requires copying data. If `batch_shape` is `None`, the operator
+ can take arguments of any batch shape without copying. See examples.
+
+ Args:
+ num_rows: Scalar non-negative integer `Tensor`. Number of rows in the
+ corresponding zero matrix.
+ num_columns: Scalar non-negative integer `Tensor`. Number of columns in
+ the corresponding zero matrix. If `None`, defaults to the value of
+ `num_rows`.
+ batch_shape: Optional `1-D` integer `Tensor`. The shape of the leading
+ dimensions. If `None`, this operator has no leading dimensions.
+ dtype: Data type of the matrix that this operator represents.
+ is_non_singular: Expect that this operator is non-singular.
+ is_self_adjoint: Expect that this operator is equal to its hermitian
+ transpose.
+ is_positive_definite: Expect that this operator is positive definite,
+ meaning the quadratic form `x^H A x` has positive real part for all
+ nonzero `x`. Note that we do not require the operator to be
+ self-adjoint to be positive-definite. See:
+ https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+ is_square: Expect that this operator acts like square [batch] matrices.
+ assert_proper_shapes: Python `bool`. If `False`, only perform static
+ checks that initialization and method arguments have proper shape.
+ If `True`, and static checks are inconclusive, add asserts to the graph.
+      name: A name for this `LinearOperator`.
+
+ Raises:
+ ValueError: If `num_rows` is determined statically to be non-scalar, or
+ negative.
+ ValueError: If `num_columns` is determined statically to be non-scalar,
+ or negative.
+ ValueError: If `batch_shape` is determined statically to not be 1-D, or
+ negative.
+      ValueError: If `is_self_adjoint` is not `True` for a square operator, or
+        if `is_non_singular` or `is_positive_definite` is `True`.
+ """
+ dtype = dtype or dtypes.float32
+ self._assert_proper_shapes = assert_proper_shapes
+
+ with ops.name_scope(name):
+ dtype = dtypes.as_dtype(dtype)
+ if not is_self_adjoint and is_square:
+ raise ValueError("A zero operator is always self adjoint.")
+ if is_non_singular:
+ raise ValueError("A zero operator is always singular.")
+ if is_positive_definite:
+        raise ValueError("A zero operator is never positive-definite.")
+
+ super(LinearOperatorZeros, self).__init__(
+ dtype=dtype,
+ is_non_singular=is_non_singular,
+ is_self_adjoint=is_self_adjoint,
+ is_positive_definite=is_positive_definite,
+ is_square=is_square,
+ name=name)
+
+ self._num_rows = linear_operator_util.shape_tensor(
+ num_rows, name="num_rows")
+ self._num_rows_static = tensor_util.constant_value(self._num_rows)
+
+ if num_columns is None:
+ num_columns = num_rows
+
+ self._num_columns = linear_operator_util.shape_tensor(
+ num_columns, name="num_columns")
+ self._num_columns_static = tensor_util.constant_value(self._num_columns)
+
+ self._check_domain_range_possibly_add_asserts()
+
+ if (self._num_rows_static is not None and
+ self._num_columns_static is not None):
+ if is_square and self._num_rows_static != self._num_columns_static:
+ raise ValueError(
+ "LinearOperatorZeros initialized as is_square=True, but got "
+ "num_rows({}) != num_columns({})".format(
+ self._num_rows_static,
+ self._num_columns_static))
+
+ if batch_shape is None:
+ self._batch_shape_arg = None
+ else:
+ self._batch_shape_arg = linear_operator_util.shape_tensor(
+ batch_shape, name="batch_shape_arg")
+ self._batch_shape_static = tensor_util.constant_value(
+ self._batch_shape_arg)
+ self._check_batch_shape_possibly_add_asserts()
+
+ def _shape(self):
+ matrix_shape = tensor_shape.TensorShape((self._num_rows_static,
+ self._num_columns_static))
+ if self._batch_shape_arg is None:
+ return matrix_shape
+
+ batch_shape = tensor_shape.TensorShape(self._batch_shape_static)
+ return batch_shape.concatenate(matrix_shape)
+
+ def _shape_tensor(self):
+ matrix_shape = array_ops.stack((self._num_rows, self._num_columns), axis=0)
+ if self._batch_shape_arg is None:
+ return matrix_shape
+
+ return array_ops.concat((self._batch_shape_arg, matrix_shape), 0)
+
+ def _assert_non_singular(self):
+ raise errors.InvalidArgumentError(
+ node_def=None, op=None, message="Zero operators are always "
+ "non-invertible.")
+
+ def _assert_positive_definite(self):
+ raise errors.InvalidArgumentError(
+ node_def=None, op=None, message="Zero operators are always "
+ "non-positive definite.")
+
+ def _assert_self_adjoint(self):
+ return control_flow_ops.no_op("assert_self_adjoint")
+
+ def _possibly_broadcast_batch_shape(self, x):
+ """Return 'x', possibly after broadcasting the leading dimensions."""
+ # If we have no batch shape, our batch shape broadcasts with everything!
+ if self._batch_shape_arg is None:
+ return x
+
+ # Static attempt:
+ # If we determine that no broadcast is necessary, pass x through
+ # If we need a broadcast, add to an array of zeros.
+ #
+    # special_shape is the shape that, when broadcast with x's shape, will give
+    # the correct broadcast_shape. Note that we have already verified that the
+    # second-to-last dimension of self.shape matches x's shape in
+    # assert_compatible_matrix_dimensions. Also, the final dimension of 'x' can
+    # have any shape. Therefore, the final two dimensions of special_shape are
+    # 1's.
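+    # For example (illustrative): batch_shape [2] gives special_shape
+    # [2, 1, 1]; x of shape [2, 2, 3] broadcasts to itself and is returned
+    # unchanged, while x of shape [1, 2, 3] is broadcast to [2, 2, 3] by the
+    # addition below.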
+ special_shape = self.batch_shape.concatenate([1, 1])
+ bshape = array_ops.broadcast_static_shape(x.get_shape(), special_shape)
+ if special_shape.is_fully_defined():
+ # bshape.is_fully_defined iff special_shape.is_fully_defined.
+ if bshape == x.get_shape():
+ return x
+ # Use the built in broadcasting of addition.
+ zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
+ return x + zeros
+
+ # Dynamic broadcast:
+ # Always add to an array of zeros, rather than using a "cond", since a
+ # cond would require copying data from GPU --> CPU.
+ special_shape = array_ops.concat((self.batch_shape_tensor(), [1, 1]), 0)
+ zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
+ return x + zeros
+
+ def _matmul(self, x, adjoint=False, adjoint_arg=False):
+ if self._assert_proper_shapes:
+ x = linalg.adjoint(x) if adjoint_arg else x
+ aps = linear_operator_util.assert_compatible_matrix_dimensions(self, x)
+ x = control_flow_ops.with_dependencies([aps], x)
+ if self.is_square:
+ # Note that adjoint has no effect since this matrix is self-adjoint.
+ if adjoint_arg:
+ output_shape = array_ops.concat([
+ array_ops.shape(x)[:-2],
+ [array_ops.shape(x)[-1], array_ops.shape(x)[-2]]], axis=0)
+ else:
+ output_shape = array_ops.shape(x)
+
+ return self._possibly_broadcast_batch_shape(
+ array_ops.zeros(shape=output_shape, dtype=x.dtype))
+
+ x_shape = array_ops.shape(x)
+ n = self._num_columns if adjoint else self._num_rows
+ m = x_shape[-2] if adjoint_arg else x_shape[-1]
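+    # For example (illustrative): a zero operator with num_rows=3 and
+    # num_columns=2 applied to x of shape [..., 2, 5] yields zeros of shape
+    # [..., 3, 5]; with adjoint=True it acts as the [2, 3] adjoint instead.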
+
+ output_shape = array_ops.concat([x_shape[:-2], [n, m]], axis=0)
+
+ zeros = array_ops.zeros(shape=output_shape, dtype=x.dtype)
+ return self._possibly_broadcast_batch_shape(zeros)
+
+ def _determinant(self):
+ if self.batch_shape.is_fully_defined():
+ return array_ops.zeros(shape=self.batch_shape, dtype=self.dtype)
+ else:
+ return array_ops.zeros(shape=self.batch_shape_tensor(), dtype=self.dtype)
+
+ def _trace(self):
+ # Get Tensor of all zeros of same shape as self.batch_shape.
+ if self.batch_shape.is_fully_defined():
+ return array_ops.zeros(shape=self.batch_shape, dtype=self.dtype)
+ else:
+ return array_ops.zeros(shape=self.batch_shape_tensor(), dtype=self.dtype)
+
+ def _diag_part(self):
+ return self._zeros_diag()
+
+  def add_to_tensor(self, mat, name="add_to_tensor"):
+    """Add matrix represented by this operator to `mat`. Equiv to `0 + mat`.
+
+ Args:
+ mat: `Tensor` with same `dtype` and shape broadcastable to `self`.
+ name: A name to give this `Op`.
+
+ Returns:
+ A `Tensor` with broadcast shape and same `dtype` as `self`.
+ """
+ return self._possibly_broadcast_batch_shape(mat)
+
+ def _check_domain_range_possibly_add_asserts(self):
+ """Static check of init arg `num_rows`, possibly add asserts."""
+ # Possibly add asserts.
+ if self._assert_proper_shapes:
+ self._num_rows = control_flow_ops.with_dependencies([
+ check_ops.assert_rank(
+ self._num_rows,
+ 0,
+ message="Argument num_rows must be a 0-D Tensor."),
+ check_ops.assert_non_negative(
+ self._num_rows,
+ message="Argument num_rows must be non-negative."),
+ ], self._num_rows)
+ self._num_columns = control_flow_ops.with_dependencies([
+ check_ops.assert_rank(
+ self._num_columns,
+ 0,
+ message="Argument num_columns must be a 0-D Tensor."),
+ check_ops.assert_non_negative(
+ self._num_columns,
+ message="Argument num_columns must be non-negative."),
+ ], self._num_columns)
+
+ # Static checks.
+ if not self._num_rows.dtype.is_integer:
+ raise TypeError("Argument num_rows must be integer type. Found:"
+ " %s" % self._num_rows)
+
+ if not self._num_columns.dtype.is_integer:
+ raise TypeError("Argument num_columns must be integer type. Found:"
+ " %s" % self._num_columns)
+
+ num_rows_static = self._num_rows_static
+ num_columns_static = self._num_columns_static
+
+ if num_rows_static is not None:
+ if num_rows_static.ndim != 0:
+ raise ValueError("Argument num_rows must be a 0-D Tensor. Found:"
+ " %s" % num_rows_static)
+
+ if num_rows_static < 0:
+ raise ValueError("Argument num_rows must be non-negative. Found:"
+ " %s" % num_rows_static)
+ if num_columns_static is not None:
+ if num_columns_static.ndim != 0:
+ raise ValueError("Argument num_columns must be a 0-D Tensor. Found:"
+ " %s" % num_columns_static)
+
+ if num_columns_static < 0:
+ raise ValueError("Argument num_columns must be non-negative. Found:"
+ " %s" % num_columns_static)
+
+ def _check_batch_shape_possibly_add_asserts(self):
+ """Static check of init arg `batch_shape`, possibly add asserts."""
+ if self._batch_shape_arg is None:
+ return
+
+ # Possibly add asserts
+ if self._assert_proper_shapes:
+ self._batch_shape_arg = control_flow_ops.with_dependencies([
+ check_ops.assert_rank(
+ self._batch_shape_arg,
+ 1,
+ message="Argument batch_shape must be a 1-D Tensor."),
+ check_ops.assert_non_negative(
+ self._batch_shape_arg,
+ message="Argument batch_shape must be non-negative."),
+ ], self._batch_shape_arg)
+
+ # Static checks
+ if not self._batch_shape_arg.dtype.is_integer:
+ raise TypeError("Argument batch_shape must be integer type. Found:"
+ " %s" % self._batch_shape_arg)
+
+ if self._batch_shape_static is None:
+ return # Cannot do any other static checks.
+
+ if self._batch_shape_static.ndim != 1:
+ raise ValueError("Argument batch_shape must be a 1-D Tensor. Found:"
+ " %s" % self._batch_shape_static)
+
+ if np.any(self._batch_shape_static < 0):
+      raise ValueError("Argument batch_shape must be non-negative. Found:"
+                       " %s" % self._batch_shape_static)
+
+ def _min_matrix_dim(self):
+ """Minimum of domain/range dimension, if statically available, else None."""
+ domain_dim = self.domain_dimension.value
+ range_dim = self.range_dimension.value
+ if domain_dim is None or range_dim is None:
+ return None
+ return min(domain_dim, range_dim)
+
+ def _min_matrix_dim_tensor(self):
+ """Minimum of domain/range dimension, as a tensor."""
+ return math_ops.reduce_min(self.shape_tensor()[-2:])
+
+ def _zeros_diag(self):
+ """Returns the diagonal of this operator as all zeros."""
+ if self.shape.is_fully_defined():
+ d_shape = self.batch_shape.concatenate([self._min_matrix_dim()])
+ else:
+ d_shape = array_ops.concat(
+ [self.batch_shape_tensor(),
+ [self._min_matrix_dim_tensor()]], axis=0)
+
+ return array_ops.zeros(shape=d_shape, dtype=self.dtype)
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 1f56ad25bf..8b259b6b6b 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -196,15 +196,16 @@ class ResourceVariable(variables.RefVariable):
the variable are fixed. The value can be changed using one of the assign
methods.
- Just like any `Tensor`, variables created with `ResourceVariable()` can be
- used as inputs for other Ops in the graph. Additionally, all the operators
- overloaded for the `Tensor` class are carried over to variables, so you can
- also add nodes to the graph by just doing arithmetic on variables.
+ Just like any `Tensor`, variables created with
+ `tf.Variable(use_resource=True)` can be used as inputs for other Ops in the
+ graph. Additionally, all the operators overloaded for the `Tensor` class are
+ carried over to variables, so you can also add nodes to the graph by just
+ doing arithmetic on variables.
- Unlike tf.Variable, a tf.ResourceVariable has well-defined semantics. Each
+  Unlike a ref-based variable, a ResourceVariable has well-defined semantics. Each
usage of a ResourceVariable in a TensorFlow graph adds a read_value operation
- to the graph. The Tensors returned by a read_value operation are guaranteed
- to see all modifications to the value of the variable which happen in any
+ to the graph. The Tensors returned by a read_value operation are guaranteed to
+ see all modifications to the value of the variable which happen in any
operation on which the read_value depends (either directly, indirectly, or
via a control dependency) and guaranteed to not see any modification to the
value of the variable from operations that depend on the read_value operation.
@@ -218,7 +219,7 @@ class ResourceVariable(variables.RefVariable):
can cause tf.Variable and tf.ResourceVariable to behave differently:
```python
- a = tf.ResourceVariable(1.0)
+ a = tf.Variable(1.0, use_resource=True)
a.initializer.run()
assign = a.assign(2.0)
@@ -742,8 +743,14 @@ class ResourceVariable(variables.RefVariable):
def _read_variable_op(self):
if self.trainable:
tape.watch_variable(self)
- return gen_resource_variable_ops.read_variable_op(self._handle,
- self._dtype)
+ result = gen_resource_variable_ops.read_variable_op(self._handle,
+ self._dtype)
+ if not context.executing_eagerly():
+ # Note that if a control flow context is active the input of the read op
+ # might not actually be the handle. This line bypasses it.
+ tape.record_operation(
+ "ReadVariableOp", [result], [self._handle], lambda x: [x])
+ return result
def read_value(self):
"""Constructs an op which reads the value of this variable.
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index deba133fb9..7b6ab20975 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -417,24 +417,30 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
# Backward direction
if not time_major:
- time_dim = 1
- batch_dim = 0
+ time_axis = 1
+ batch_axis = 0
else:
- time_dim = 0
- batch_dim = 1
+ time_axis = 0
+ batch_axis = 1
- def _reverse(input_, seq_lengths, seq_dim, batch_dim):
+ def _reverse(input_, seq_lengths, seq_axis, batch_axis):
if seq_lengths is not None:
return array_ops.reverse_sequence(
input=input_, seq_lengths=seq_lengths,
- seq_dim=seq_dim, batch_dim=batch_dim)
+ seq_axis=seq_axis, batch_axis=batch_axis)
else:
- return array_ops.reverse(input_, axis=[seq_dim])
+ return array_ops.reverse(input_, axis=[seq_axis])
with vs.variable_scope("bw") as bw_scope:
- inputs_reverse = _reverse(
- inputs, seq_lengths=sequence_length,
- seq_dim=time_dim, batch_dim=batch_dim)
+
+ def _map_reverse(inp):
+ return _reverse(
+ inp,
+ seq_lengths=sequence_length,
+ seq_axis=time_axis,
+ batch_axis=batch_axis)
+
+ inputs_reverse = nest.map_structure(_map_reverse, inputs)
tmp, output_state_bw = dynamic_rnn(
cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length,
initial_state=initial_state_bw, dtype=dtype,
@@ -443,7 +449,7 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
output_bw = _reverse(
tmp, seq_lengths=sequence_length,
- seq_dim=time_dim, batch_dim=batch_dim)
+ seq_axis=time_axis, batch_axis=batch_axis)
outputs = (output_fw, output_bw)
output_states = (output_state_fw, output_state_bw)
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 70805fd572..42806ba6ec 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -1261,6 +1261,11 @@ class MultiRNNCell(RNNCell):
raise TypeError(
"cells must be a list or tuple, but saw: %s." % cells)
+ if len(set([id(cell) for cell in cells])) < len(cells):
+ logging.log_first_n(logging.WARN,
+ "At least two cells provided to MultiRNNCell "
+ "are the same object and will share weights.", 1)
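+      # e.g. MultiRNNCell([cell] * 2) shares weights and triggers the warning,
+      # while MultiRNNCell([make_cell() for _ in range(2)]) does not
+      # (illustrative; make_cell is a hypothetical cell constructor).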
+
self._cells = cells
for cell_number, cell in enumerate(self._cells):
# Add Checkpointable dependencies on these cells so their variables get
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index cc92da4fd7..f86dfb3527 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -554,7 +554,7 @@ class _EagerTensorArray(object):
self._tensor_array.extend([None for _ in range(index - size + 1)])
if not isinstance(value, ops.EagerTensor):
- value = constant_op.constant(value)
+ value = ops.convert_to_tensor(value)
if self._infer_shape:
if self._element_shape is None:
@@ -633,8 +633,8 @@ class _EagerTensorArray(object):
def split(self, value, lengths, name=None):
"""See TensorArray."""
# error checking to match graph-mode errors
- value = constant_op.constant(value)
- lengths = constant_op.constant(lengths)
+ value = ops.convert_to_tensor(value)
+ lengths = ops.convert_to_tensor(lengths)
sum_lengths = math_ops.reduce_sum(lengths)
if lengths.shape.ndims != 1:
raise errors_impl.InvalidArgumentError(
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index d03d93beeb..fc00ce68ae 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -220,27 +220,31 @@ class Variable(six.with_metaclass(VariableMetaclass,
various `Optimizer` classes use this collection as the default list of
variables to optimize.
- WARNING: tf.Variable objects have a non-intuitive memory model. A Variable is
- represented internally as a mutable Tensor which can non-deterministically
- alias other Tensors in a graph. The set of operations which consume a Variable
- and can lead to aliasing is undetermined and can change across TensorFlow
- versions. Avoid writing code which relies on the value of a Variable either
- changing or not changing as other operations happen. For example, using
- Variable objects or simple functions thereof as predicates in a `tf.cond` is
- dangerous and error-prone:
+ WARNING: tf.Variable objects by default have a non-intuitive memory model. A
+ Variable is represented internally as a mutable Tensor which can
+ non-deterministically alias other Tensors in a graph. The set of operations
+ which consume a Variable and can lead to aliasing is undetermined and can
+ change across TensorFlow versions. Avoid writing code which relies on the
+ value of a Variable either changing or not changing as other operations
+ happen. For example, using Variable objects or simple functions thereof as
+ predicates in a `tf.cond` is dangerous and error-prone:
```
v = tf.Variable(True)
tf.cond(v, lambda: v.assign(False), my_false_fn) # Note: this is broken.
```
- Here replacing tf.Variable with tf.contrib.eager.Variable will fix any
- nondeterminism issues.
+  Here, adding `use_resource=True` when constructing the variable will fix any
+  nondeterminism issues:
+ ```
+ v = tf.Variable(True, use_resource=True)
+ tf.cond(v, lambda: v.assign(False), my_false_fn)
+ ```
To use the replacement for variables which does
not have these issues:
- * Replace `tf.Variable` with `tf.contrib.eager.Variable`;
+ * Add `use_resource=True` when constructing `tf.Variable`;
* Call `tf.get_variable_scope().set_use_resource(True)` inside a
`tf.variable_scope` before the `tf.get_variable()` call.
"""
@@ -869,19 +873,7 @@ class RefVariable(Variable):
ValueError: If the initial value is not specified, or does not have a
shape and `validate_shape` is `True`.
RuntimeError: If eager execution is enabled.
-
- @compatibility(eager)
- `tf.Variable` is not compatible with eager execution. Use
- `tfe.Variable` instead which is compatible with both eager execution
- and graph construction. See [the TensorFlow Eager Execution
- guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers)
- for details on how variables work in eager execution.
- @end_compatibility
"""
- if context.executing_eagerly():
- raise RuntimeError(
- "tf.Variable not supported when eager execution is enabled. "
- "Please use tf.contrib.eager.Variable instead")
self._in_graph_mode = True
if variable_def:
# If variable_def is provided, recreates the variable from its fields.
@@ -992,8 +984,7 @@ class RefVariable(Variable):
# Ensure that we weren't lifted into the eager context.
if context.executing_eagerly():
raise RuntimeError(
-        "tf.Variable not supported when eager execution is enabled. "
-        "Please use tf.contrib.eager.Variable instead")
+        "RefVariable not supported when eager execution is enabled.")
with ops.name_scope(name, "Variable", [] if init_from_fn else
[initial_value]) as name:
diff --git a/tensorflow/python/profiler/profile_context.py b/tensorflow/python/profiler/profile_context.py
index 18eb66ef98..fa4260a712 100644
--- a/tensorflow/python/profiler/profile_context.py
+++ b/tensorflow/python/profiler/profile_context.py
@@ -88,16 +88,19 @@ def _profiled_run(self,
to_profiles = self.profile_context._profile_candidates()
for to_prof in to_profiles:
cmd, opts, _ = to_prof
+ saved_views = self.profile_context._views.setdefault(cmd, {})
if self.profile_context._debug:
sys.stderr.write('debug: profiling %s step: %d\n' % (cmd, step))
if cmd == 'graph':
- self.profile_context.profiler.profile_graph(opts)
+ saved_views[step] = self.profile_context.profiler.profile_graph(opts)
elif cmd == 'scope':
- self.profile_context.profiler.profile_name_scope(opts)
+ saved_views[step] = self.profile_context.profiler.profile_name_scope(
+ opts)
elif cmd == 'op':
- self.profile_context.profiler.profile_operations(opts)
+ saved_views[step] = self.profile_context.profiler.profile_operations(
+ opts)
elif cmd == 'code':
- self.profile_context.profiler.profile_python(opts)
+ saved_views[step] = self.profile_context.profiler.profile_python(opts)
else:
raise ValueError('Unknown cmd: %s\n' % cmd)
return ret
@@ -185,8 +188,30 @@ class ProfileContext(object):
self._traced_steps = 0
self._auto_profiles = []
self._profiler = None
+ self._views = {}
self._lock = threading.Lock()
+ def get_profiles(self, cmd):
+ """Returns profiling results for each step at which `cmd` was run.
+
+ Args:
+ cmd: string, profiling command used in an `add_auto_profiling` call.
+
+ Returns:
+ dict[int: (MultiGraphNodeProto | GraphNodeProto)]. Keys are steps at which
+ the profiling command was run. Values are the outputs of profiling.
+      For "code" and "op" commands this will be a `MultiGraphNodeProto`, for
+      "scope" and "graph" commands this will be a `GraphNodeProto`.
+
+ Raises:
+ ValueError: if `cmd` was never run (either because no session.run call was
+ made or because there was no `add_auto_profiling` call with the specified
+      `cmd`).
+ """
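+    # Illustrative usage (added commentary, not part of the original change):
+    # after add_auto_profiling('op', opts, [15, 50, 100]) and the matching
+    # session.run calls, get_profiles('op') returns a dict keyed by
+    # {15, 50, 100}, as exercised in profile_context_test.py below.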
+ if cmd not in self._views:
+      raise ValueError('No autoprofiler was run for command: {}'.format(cmd))
+ return self._views[cmd]
+
def add_auto_profiling(self, cmd, options, profile_steps):
"""Traces and profiles at some session run steps.
diff --git a/tensorflow/python/profiler/profile_context_test.py b/tensorflow/python/profiler/profile_context_test.py
index a623beee23..107ad443c3 100644
--- a/tensorflow/python/profiler/profile_context_test.py
+++ b/tensorflow/python/profiler/profile_context_test.py
@@ -61,6 +61,8 @@ class ProfilerContextTest(test.TestCase):
profile_str = f.read()
gfile.Remove(outfile)
+ self.assertEqual(set([15, 50, 100]), set(pctx.get_profiles("op").keys()))
+
with lib.ProfilerFromFile(
os.path.join(test.get_temp_dir(), "profile_100")) as profiler:
profiler.profile_operations(options=opts)
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index e58be804c2..8c985a7c2f 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -34,6 +34,7 @@ from tensorflow.python.platform import tf_logging
from tensorflow.python.saved_model import constants
from tensorflow.python.training import saver as tf_saver
from tensorflow.python.util import compat
+from tensorflow.python.util.deprecation import deprecated_args
from tensorflow.python.util.tf_export import tf_export
@@ -133,39 +134,32 @@ class SavedModelBuilder(object):
tf_logging.info("Assets written to: %s",
compat.as_text(assets_destination_dir))
- def _maybe_add_legacy_init_op(self, legacy_init_op=None):
- """Add legacy init op to the SavedModel.
+ def _maybe_add_main_op(self, main_op):
+ """Adds main op to the SavedModel.
Args:
- legacy_init_op: Optional legacy init op to support backward compatibility.
+ main_op: Main op to run as part of graph initialization. If None, no
+ main op will be added to the graph.
Raises:
- TypeError if legacy init op is not of type `Operation`.
- AssertionError if the graph already contains one or more legacy init ops.
+ TypeError: if main op is provided but is not of type `Operation`.
+ ValueError: if the Graph already contains an init op.
"""
- if legacy_init_op is not None:
- if not isinstance(legacy_init_op, ops.Operation):
- raise TypeError("legacy_init_op needs to be an Operation: %r" %
- legacy_init_op)
- if ops.get_collection(constants.LEGACY_INIT_OP_KEY):
- raise AssertionError(
- "graph already contains one or more legacy init ops under the "
- "collection {}.".format(constants.LEGACY_INIT_OP_KEY))
- ops.add_to_collection(constants.LEGACY_INIT_OP_KEY, legacy_init_op)
-
- def _add_main_op(self, main_op):
- """Add main op to the SavedModel.
+ if main_op is None:
+ return
- Args:
- main_op: Main op to run as part of graph initialization.
+ if not isinstance(main_op, ops.Operation):
+ raise TypeError("main_op needs to be an Operation: %r" % main_op)
- Raises:
- TypeError if main op is not of type `Operation`.
- """
- if main_op is not None:
- if not isinstance(main_op, ops.Operation):
- raise TypeError("main_op needs to be an Operation: %r" % main_op)
- ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
+ # Validate that no other init ops have been added to this graph already.
+ # We check main_op and legacy_init_op for thoroughness and explicitness.
+ for init_op_key in (constants.MAIN_OP_KEY, constants.LEGACY_INIT_OP_KEY):
+ if ops.get_collection(init_op_key):
+ raise ValueError(
+ "Graph already contains one or more main ops under the "
+ "collection {}.".format(init_op_key))
+
+ ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
def _add_train_op(self, train_op):
"""Add train op to the SavedModel.
@@ -257,16 +251,12 @@ class SavedModelBuilder(object):
self._validate_tensor_info(outputs[outputs_key])
def _add_collections(
- self, assets_collection, legacy_init_op, main_op, train_op):
+ self, assets_collection, main_op, train_op):
"""Add asset and op collections to be saved."""
# Save asset files and write them to disk, if any.
self._save_and_write_assets(assets_collection)
- if main_op is None:
- # Add legacy init op to the SavedModel.
- self._maybe_add_legacy_init_op(legacy_init_op)
- else:
- self._add_main_op(main_op)
+ self._maybe_add_main_op(main_op)
self._add_train_op(train_op)
@@ -282,6 +272,9 @@ class SavedModelBuilder(object):
allow_empty=True)
return saver
+ @deprecated_args(None,
+ "Pass your op to the equivalent parameter main_op instead.",
+ "legacy_init_op")
def add_meta_graph(self,
tags,
signature_def_map=None,
@@ -306,7 +299,7 @@ class SavedModelBuilder(object):
that this collection should be a subset of the assets saved as part of
the first meta graph in the SavedModel.
legacy_init_op: Legacy support for op or group of ops to execute after the
- restore op upon a load.
+ restore op upon a load. Deprecated; please use main_op instead.
clear_devices: Set to true if the device info on the default graph should
be cleared.
main_op: Op or group of ops to execute when the graph is loaded. Note
@@ -333,8 +326,12 @@ class SavedModelBuilder(object):
# properly populated.
self._validate_signature_def_map(signature_def_map)
+ # legacy_init_op is deprecated and will go away in TF 2.0.
+ # Re-map it to main_op, since the two are treated identically.
+ main_op = main_op or legacy_init_op
+
# Add assets and ops
- self._add_collections(assets_collection, legacy_init_op, main_op, None)
+ self._add_collections(assets_collection, main_op, None)
saver = self._maybe_create_saver(saver)
@@ -351,6 +348,9 @@ class SavedModelBuilder(object):
# Tag the meta graph def and add it to the SavedModel.
self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
+ @deprecated_args(None,
+ "Pass your op to the equivalent parameter main_op instead.",
+ "legacy_init_op")
def add_meta_graph_and_variables(self,
sess,
tags,
@@ -378,7 +378,7 @@ class SavedModelBuilder(object):
def.
assets_collection: Assets collection to be saved with SavedModel.
legacy_init_op: Legacy support for op or group of ops to execute after the
- restore op upon a load.
+ restore op upon a load. Deprecated; please use main_op instead.
clear_devices: Set to true if the device info on the default graph should
be cleared.
main_op: Op or group of ops to execute when the graph is loaded. Note
@@ -402,8 +402,12 @@ class SavedModelBuilder(object):
# properly populated.
self._validate_signature_def_map(signature_def_map)
+ # legacy_init_op is deprecated and will go away in TF 2.0.
+ # Re-map it to main_op, since the two are treated identically.
+ main_op = main_op or legacy_init_op
+
# Add assets and ops
- self._add_collections(assets_collection, legacy_init_op, main_op, None)
+ self._add_collections(assets_collection, main_op, None)
# Create the variables sub-directory, if it does not exist.
variables_dir = os.path.join(
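A short sketch of the migration this deprecation asks for; the variable, op names, and export path are illustrative:

```python
# Hedged sketch: pass main_op instead of the deprecated legacy_init_op.
import tensorflow as tf

export_dir = "/tmp/saved_model_example"
with tf.Graph().as_default(), tf.Session() as sess:
  v = tf.Variable(1, name="v")
  init_op = tf.group(tf.assign_add(v, 1), name="init_op")
  sess.run(tf.global_variables_initializer())

  builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
  # Before (now deprecated):
  #   builder.add_meta_graph_and_variables(sess, ["serve"],
  #                                        legacy_init_op=init_op)
  builder.add_meta_graph_and_variables(sess, ["serve"], main_op=init_op)
  builder.save()
```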
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index 61c6ffbd0d..cb251f08bb 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -60,6 +60,10 @@ SAVED_MODEL_FILENAME_PBTXT = "saved_model.pbtxt"
tf_export("saved_model.constants.SAVED_MODEL_FILENAME_PBTXT").export_constant(
__name__, "SAVED_MODEL_FILENAME_PBTXT")
+# File name for json format of SavedModel.
+# Not exported while keras_saved_model is in contrib.
+SAVED_MODEL_FILENAME_JSON = "saved_model.json"
+
# Subdirectory name containing the variables/checkpoint files.
VARIABLES_DIRECTORY = "variables"
tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
@@ -69,5 +73,3 @@ tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
VARIABLES_FILENAME = "variables"
tf_export("saved_model.constants.VARIABLES_FILENAME").export_constant(
__name__, "VARIABLES_FILENAME")
-
-
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index e5f649fdab..16077f52fa 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -116,11 +116,14 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
return asset_tensor_dict
-def _get_main_op_tensor(meta_graph_def_to_load):
+def _get_main_op_tensor(
+ meta_graph_def_to_load, init_op_key=constants.MAIN_OP_KEY):
"""Gets the main op tensor, if one exists.
Args:
meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded.
+ init_op_key: name of the collection to check; should be one of MAIN_OP_KEY
+ or the deprecated LEGACY_INIT_OP_KEY.
Returns:
The main op tensor if it exists, and `None` otherwise.
@@ -131,38 +134,15 @@ def _get_main_op_tensor(meta_graph_def_to_load):
"""
collection_def = meta_graph_def_to_load.collection_def
main_op_tensor = None
- if constants.MAIN_OP_KEY in collection_def:
- main_ops = collection_def[constants.MAIN_OP_KEY].node_list.value
+ if init_op_key in collection_def:
+ main_ops = collection_def[init_op_key].node_list.value
if len(main_ops) != 1:
- raise RuntimeError("Expected exactly one SavedModel main op.")
- main_op_tensor = ops.get_collection(constants.MAIN_OP_KEY)[0]
+ raise RuntimeError("Expected exactly one SavedModel main op. "
+ "Found: {}".format(main_ops))
+ main_op_tensor = ops.get_collection(init_op_key)[0]
return main_op_tensor
-def _get_legacy_init_op_tensor(meta_graph_def_to_load):
- """Gets the legacy init op tensor, if one exists.
-
- Args:
- meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded.
-
- Returns:
- The legacy init op tensor, if it exists and `None` otherwise.
-
- Raises:
- RuntimeError: If the collection def corresponding to the legacy init op key
- has other than exactly one tensor.
- """
- collection_def = meta_graph_def_to_load.collection_def
- legacy_init_op_tensor = None
- if constants.LEGACY_INIT_OP_KEY in collection_def:
- legacy_init_ops = collection_def[
- constants.LEGACY_INIT_OP_KEY].node_list.value
- if len(legacy_init_ops) != 1:
- raise RuntimeError("Expected exactly one legacy serving init op.")
- legacy_init_op_tensor = ops.get_collection(constants.LEGACY_INIT_OP_KEY)[0]
- return legacy_init_op_tensor
-
-
@tf_export("saved_model.loader.maybe_saved_model_directory")
def maybe_saved_model_directory(export_dir):
"""Checks whether the provided export directory could contain a SavedModel.
@@ -284,12 +264,15 @@ class SavedModelLoader(object):
**saver_kwargs: keyword arguments to pass to tf.train.import_meta_graph.
Returns:
- Saver defined by the MetaGraph, which can be used to restore the variable
- values.
+ A tuple of
+ * Saver defined by the MetaGraph, which can be used to restore the
+ variable values.
+ * List of `Operation`/`Tensor` objects returned from
+ `tf.import_graph_def` (may be `None`).
"""
meta_graph_def = self.get_meta_graph_def_from_tags(tags)
with graph.as_default():
- return tf_saver.import_meta_graph(
+ return tf_saver._import_meta_graph_with_return_elements( # pylint: disable=protected-access
meta_graph_def, import_scope=import_scope, **saver_kwargs)
def restore_variables(self, sess, saver, import_scope=None):
@@ -340,8 +323,8 @@ class SavedModelLoader(object):
self._export_dir, meta_graph_def, import_scope=import_scope)
main_op_tensor = (
- _get_main_op_tensor(meta_graph_def) or
- (_get_legacy_init_op_tensor(meta_graph_def)))
+ _get_main_op_tensor(meta_graph_def, constants.MAIN_OP_KEY) or
+ _get_main_op_tensor(meta_graph_def, constants.LEGACY_INIT_OP_KEY))
if main_op_tensor is not None:
sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
@@ -361,8 +344,8 @@ class SavedModelLoader(object):
`MetaGraphDef` proto of the graph that was loaded.
"""
with sess.graph.as_default():
- saver = self.load_graph(sess.graph, tags, import_scope,
- **saver_kwargs)
+ saver, _ = self.load_graph(sess.graph, tags, import_scope,
+ **saver_kwargs)
self.restore_variables(sess, saver, import_scope)
self.run_init_ops(sess, tags, import_scope)
return self.get_meta_graph_def_from_tags(tags)
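A minimal sketch of the new `load_graph` return convention, assuming a SavedModel exported under the tag "foo_graph" with tensors named "x:0" and "y:0" (mirroring the tests below; the path is illustrative):

```python
# Hedged sketch: load_graph now returns a (saver, return_elements) tuple.
from tensorflow.python.framework import ops
from tensorflow.python.saved_model import loader_impl

loader = loader_impl.SavedModelLoader("/tmp/simple_add_saved_model")
graph = ops.Graph()
saver, returned = loader.load_graph(
    graph, ["foo_graph"], return_elements=["y:0", "x:0"])
y, x = returned  # Tensors resolved in the freshly imported graph.

# Callers that only want the saver can discard the second element.
saver, _ = loader.load_graph(ops.Graph(), ["foo_graph"])
```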
diff --git a/tensorflow/python/saved_model/loader_test.py b/tensorflow/python/saved_model/loader_test.py
index ce18859f6b..9a0b276a4b 100644
--- a/tensorflow/python/saved_model/loader_test.py
+++ b/tensorflow/python/saved_model/loader_test.py
@@ -111,7 +111,8 @@ class SavedModelLoaderTest(test.TestCase):
def test_load_with_import_scope(self):
loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
with self.test_session(graph=ops.Graph()) as sess:
- saver = loader.load_graph(sess.graph, ["foo_graph"], import_scope="baz")
+ saver, _ = loader.load_graph(
+ sess.graph, ["foo_graph"], import_scope="baz")
# The default saver should not work when the import scope is set.
with self.assertRaises(errors.NotFoundError):
@@ -149,7 +150,7 @@ class SavedModelLoaderTest(test.TestCase):
def test_run_init_op(self):
loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
graph = ops.Graph()
- saver = loader.load_graph(graph, ["foo_graph"])
+ saver, _ = loader.load_graph(graph, ["foo_graph"])
with self.test_session(graph=graph) as sess:
loader.restore_variables(sess, saver)
self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
@@ -203,7 +204,7 @@ class SavedModelLoaderTest(test.TestCase):
loader = loader_impl.SavedModelLoader(path)
with self.test_session(graph=ops.Graph()) as sess:
- saver = loader.load_graph(sess.graph, ["foo_graph"])
+ saver, _ = loader.load_graph(sess.graph, ["foo_graph"])
self.assertFalse(variables._all_saveable_objects())
self.assertIsNotNone(saver)
@@ -212,6 +213,18 @@ class SavedModelLoaderTest(test.TestCase):
self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
self.assertEqual(11, sess.graph.get_tensor_by_name("y:0").eval())
+ def test_load_saved_model_graph_with_return_elements(self):
+ """Ensure that the correct elements are returned."""
+ loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
+ graph = ops.Graph()
+ _, ret = loader.load_graph(graph, ["foo_graph"],
+ return_elements=["y:0", "x:0"])
+
+ self.assertEqual(graph.get_tensor_by_name("y:0"), ret[0])
+ self.assertEqual(graph.get_tensor_by_name("x:0"), ret[1])
+
+ with self.assertRaisesRegexp(ValueError, "not found in graph"):
+ loader.load_graph(graph, ["foo_graph"], return_elements=["z:0"])
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index fb4732aca2..00b669fc97 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -846,9 +846,19 @@ class SavedModelTest(test.TestCase):
def testLegacyInitOpWithNonEmptyCollection(self):
export_dir = self._get_export_dir(
"test_legacy_init_op_with_non_empty_collection")
+ self._testInitOpsWithNonEmptyCollection(
+ export_dir, constants.LEGACY_INIT_OP_KEY)
+
+ def testMainOpWithNonEmptyCollection(self):
+ export_dir = self._get_export_dir(
+ "test_main_op_with_non_empty_collection")
+ self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY)
+
+ def _testInitOpsWithNonEmptyCollection(self, export_dir, key):
builder = saved_model_builder.SavedModelBuilder(export_dir)
- with self.test_session(graph=ops.Graph()) as sess:
+ g = ops.Graph()
+ with self.test_session(graph=g) as sess:
# Initialize variable `v1` to 1.
v1 = variables.Variable(1, name="v1")
ops.add_to_collection("v", v1)
@@ -857,19 +867,21 @@ class SavedModelTest(test.TestCase):
v2 = variables.Variable(42, name="v2", trainable=False, collections=[])
ops.add_to_collection("v", v2)
- # Set up an assignment op to be run as part of the legacy_init_op.
+ # Set up an assignment op to be run as part of the init op.
assign_v2 = state_ops.assign(v2, v1)
- legacy_init_op = control_flow_ops.group(assign_v2, name="legacy_init_op")
+ init_op = control_flow_ops.group(assign_v2, name="init_op")
sess.run(variables.global_variables_initializer())
- ops.add_to_collection(constants.LEGACY_INIT_OP_KEY,
- control_flow_ops.no_op())
- # AssertionError should be raised since the LEGACY_INIT_OP_KEY collection
+ ops.add_to_collection(key, control_flow_ops.no_op())
+ # ValueError should be raised since the LEGACY_INIT_OP_KEY collection
# is not empty and we don't support multiple init ops.
- with self.assertRaises(AssertionError):
+ with self.assertRaisesRegexp(ValueError, "Graph already contains"):
builder.add_meta_graph_and_variables(
- sess, ["foo"], legacy_init_op=legacy_init_op)
+ sess, ["foo"], legacy_init_op=init_op)
+ # We shouldn't be able to add as MAIN_OP, either.
+ with self.assertRaisesRegexp(ValueError, "Graph already contains"):
+ builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op)
def testTrainOp(self):
export_dir = self._get_export_dir("test_train_op")
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index aca084fc91..60e96ee947 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -325,7 +325,7 @@ class FileWriter(SummaryToEventTransformer):
```
The `session` argument to the constructor makes the returned `FileWriter` a
- a compatibility layer over new graph-based summaries (`tf.contrib.summary`).
+ compatibility layer over new graph-based summaries (`tf.contrib.summary`).
Crucially, this means the underlying writer resource and events file will
be shared with any other `FileWriter` using the same `session` and `logdir`,
and with any `tf.contrib.summary.SummaryWriter` in this session using the
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 883f4fd910..a052081630 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -24,7 +24,6 @@ from tensorflow.python import pywrap_tensorflow
from tensorflow.python.framework import ops
from tensorflow.python.ops import io_ops
from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
@@ -308,32 +307,19 @@ def _set_checkpoint_initializer(variable,
restore_op = io_ops.restore_v2(
ckpt_file, [tensor_name], [slice_spec], [base_type], name=name)[0]
- # TODO(priyag, allenl): Use `SaveableObject.restore` instead here.
- if resource_variable_ops.is_resource_variable(variable):
- init_op = variable.assign(restore_op, read_value=False)
- else:
- init_op = state_ops.assign(variable, restore_op)
+ names_to_saveables = saver.BaseSaverBuilder.OpListToDict([variable])
+ saveable_objects = []
+ for name, op in names_to_saveables.items():
+ for s in saver.BaseSaverBuilder.SaveableObjectsForOp(op, name):
+ saveable_objects.append(s)
+
+ assert len(saveable_objects) == 1 # Should be only one variable.
+ init_op = saveable_objects[0].restore([restore_op], restored_shapes=None)
# pylint:disable=protected-access
- # We need special handling for `DistributedVariable`s as they contain
- # mutliple actual variables. `assign` on a `DistributedVariable` returns a
- # combined `init_op` which contains initializers for all the contained
- # variables. We then set each underlying variable's `_initializer_op` using
- # the corresponding `init_op`.
- # TODO(priyag): Use `isinstance` checks when `DistributedVariable` class
- # moves out of contrib.
- if any(base.__name__ == "DistributedVariable"
- for base in variable.__class__.__bases__):
- assert distribute_lib.get_cross_tower_context()
- assert hasattr(variable, "_index")
- for (d, v) in six.iteritems(variable._index):
- v._initializer_op = init_op._index[d]
- restore_op.set_shape(v.shape)
- v._initial_value = restore_op
- else:
- variable._initializer_op = init_op
- restore_op.set_shape(variable.shape)
- variable._initial_value = restore_op
+ variable._initializer_op = init_op
+ restore_op.set_shape(variable.shape)
+ variable._initial_value = restore_op
# pylint:enable=protected-access
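The refactor above changes only how the initializer op is built; the public entry point that reaches `_set_checkpoint_initializer` is unchanged. A hedged sketch of that entry point, with an illustrative checkpoint path and variable name:

```python
# Hedged sketch: init_from_checkpoint drives _set_checkpoint_initializer.
import tensorflow as tf

with tf.Graph().as_default():
  w = tf.get_variable("scope/w", shape=[10, 2])
  # Re-initialize `w` from the matching variable in an existing checkpoint.
  tf.train.init_from_checkpoint("/tmp/model.ckpt", {"scope/w": w})
  with tf.Session() as sess:
    # The initializer now runs the checkpoint-backed restore op.
    sess.run(tf.global_variables_initializer())
```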
diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py
index 4e08a1c859..1c1f126ce9 100644
--- a/tensorflow/python/training/checkpoint_utils_test.py
+++ b/tensorflow/python/training/checkpoint_utils_test.py
@@ -386,7 +386,9 @@ class CheckpointsTest(test.TestCase):
op for op in g.get_operations()
if (op.name.startswith("init_from_checkpoint/") and
not op.name.startswith("init_from_checkpoint/checkpoint_initializer"
- ) and op.type != "AssignVariableOp")
+ ) and
+ op.type != "AssignVariableOp" and
+ op.type != "Identity")
]
self.assertEqual(ops_in_init_from_checkpoint_scope, [])
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/checkpointable/base.py
index ee35b01328..66837ee52f 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/checkpointable/base.py
@@ -144,7 +144,7 @@ class _CheckpointPosition(object):
# process deferred restorations for it and its dependencies.
restore_ops = checkpointable._restore_from_checkpoint_position(self) # pylint: disable=protected-access
if restore_ops:
- self._checkpoint.restore_ops.extend(restore_ops)
+ self._checkpoint.new_restore_ops(restore_ops)
def bind_object(self, checkpointable):
"""Set a checkpoint<->object correspondence and process slot variables.
@@ -501,12 +501,6 @@ class CheckpointableBase(object):
ValueError: If the variable name is not unique.
"""
self._maybe_initialize_checkpointable()
- if overwrite and self._lookup_dependency(name) is not None:
- raise ValueError(
- ("A variable named '%s' already exists in this Checkpointable, but "
- "Checkpointable._add_variable called to create another with "
- "that name. Variable names must be unique within a Checkpointable "
- "object.") % (name,))
with ops.init_scope():
if context.executing_eagerly():
# If this is a variable with a single Tensor stored in the checkpoint,
diff --git a/tensorflow/python/training/checkpointable/base_test.py b/tensorflow/python/training/checkpointable/base_test.py
index 950e9c5b53..fd935ac559 100644
--- a/tensorflow/python/training/checkpointable/base_test.py
+++ b/tensorflow/python/training/checkpointable/base_test.py
@@ -16,8 +16,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variable_scope
from tensorflow.python.platform import test
from tensorflow.python.training.checkpointable import base
+from tensorflow.python.training.checkpointable import util
class InterfaceTests(test.TestCase):
@@ -37,5 +40,22 @@ class InterfaceTests(test.TestCase):
self.assertIs(duplicate_name_dep, current_dependency)
self.assertEqual("leaf", current_name)
+ def testAddVariableOverwrite(self):
+ root = base.CheckpointableBase()
+ a = root._add_variable_with_custom_getter(
+ name="v", shape=[], getter=variable_scope.get_variable)
+ self.assertEqual([root, a], util.list_objects(root))
+ with ops.Graph().as_default():
+ b = root._add_variable_with_custom_getter(
+ name="v", shape=[], overwrite=True,
+ getter=variable_scope.get_variable)
+ self.assertEqual([root, b], util.list_objects(root))
+ with ops.Graph().as_default():
+ with self.assertRaisesRegexp(
+ ValueError, "already declared as a dependency"):
+ root._add_variable_with_custom_getter(
+ name="v", shape=[], overwrite=False,
+ getter=variable_scope.get_variable)
+
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/checkpointable/data_structures.py
index 019d43f09c..507cda8734 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/checkpointable/data_structures.py
@@ -57,6 +57,8 @@ def _wrap_or_unwrap(value):
return value.value
if isinstance(value, base.CheckpointableBase):
return value # Skip conversion for already checkpointable objects.
+ elif isinstance(value, dict):
+ return _DictWrapper(value)
elif isinstance(value, list):
return _ListWrapper(value)
else:
@@ -438,12 +440,15 @@ class Mapping(CheckpointableDataStructure, collections.Mapping):
def __init__(self, *args, **kwargs):
"""Construct a new sequence. Arguments are passed to `dict()`."""
super(Mapping, self).__init__()
- self._storage = dict(*args, **kwargs)
+ self._storage = self._make_storage(*args, **kwargs)
self._storage.update(
{key: self._track_value(
value, name=self._name_element(key))
for key, value in self._storage.items()})
+ def _make_storage(self, *args, **kwargs):
+ return dict(*args, **kwargs)
+
def _name_element(self, key):
if not isinstance(key, six.string_types):
raise TypeError(
@@ -476,3 +481,185 @@ class Mapping(CheckpointableDataStructure, collections.Mapping):
def __iter__(self):
return iter(self._storage)
+
+
+# Unlike _ListWrapper, having _DictWrapper inherit from dict and pass isinstance
+# checks seems infeasible. CPython will not call Python methods/properties on
+# dictionary subclasses when running e.g. {}.update(dict_subclass), and instead
+# collects elements directly from dict_subclass's C structs. So subclassing dict
+# implies that the storage has to be "self" (i.e. the C structs for the object
+# must be updated correctly), but we also need that storage to be the wrapped
+# dictionary to avoid synchronization bugs (un-tracked external modifications
+# should still show up when the dict is accessed through the wrapper). Monkey
+# patching all of the "wrapped" dict's methods instead of creating a wrapper
+# object is an option, but not a very attractive one (replacing methods without
+# creating reference cycles is difficult, and then dicts would need to be
+# special cased everywhere as being checkpointable).
+class _DictWrapper(Mapping, collections.MutableMapping):
+ """Wraps built-in dicts to support restore-on-create for variables.
+
+ _DictWrapper is to Mapping as _ListWrapper is to List. Unlike Mapping,
+ _DictWrapper allows non-string keys and values and arbitrary mutations (delete
+ keys, reassign values). Like _ListWrapper, these mutations mean that
+ _DictWrapper will raise an exception on save.
+ """
+
+ def __new__(cls, *args):
+ if len(args) == 1 and isinstance(args[0], dict):
+ return super(_DictWrapper, cls).__new__(cls)
+ else:
+ # Allow construction from a sequence, e.g. for nest.pack_sequence_as. In
+ # this case there's nothing to wrap, so we make a normal dictionary. Also
+ # allows constructing empty instances of the _DictWrapper type, as Session
+ # is wont to do (and again there's nothing to wrap, so a normal dictionary
+ # makes more sense).
+ return dict(*args)
+
+ def __init__(self, wrapped_dict):
+ self._non_string_key = False
+ self._non_append_mutation = False
+ self._external_modification = False
+ super(_DictWrapper, self).__init__(wrapped_dict)
+ self._update_snapshot()
+
+ def _make_storage(self, wrapped_dict):
+ """Re-use the wrapped dict for storage (to force them to be in sync)."""
+ return wrapped_dict
+
+ @property
+ def _checkpoint_dependencies(self):
+ """Check that the object is saveable before listing its dependencies."""
+ self._check_external_modification()
+ if self._non_string_key:
+ raise ValueError(
+ "Unable to save the object %s (a dictionary wrapper constructed "
+ "automatically on attribute assignment). The wrapped dictionary "
+ "contains a non-string key which maps to a checkpointable object or "
+ "mutable data structure.\n\nIf you don't need this dictionary "
+ "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency "
+ "object; it will be automatically un-wrapped and subsequently "
+ "ignored." % (self,))
+ if self._non_append_mutation:
+ raise ValueError(
+ "Unable to save the object %s (a dictionary wrapper constructed "
+ "automatically on attribute assignment). A key mapping to a "
+ "checkpointable object was overwritten or deleted, which would "
+ "cause problems for restoration.\n\nIf you don't need this "
+ "dictionary checkpointed, wrap it in a "
+ "tf.contrib.checkpoint.NoDependency object; it will be automatically "
+ "un-wrapped and subsequently ignored." % (self,))
+ if self._external_modification:
+ raise ValueError(
+ "Unable to save the object %s (a dictionary wrapper constructed "
+ "automatically on attribute assignment). The wrapped dictionary was "
+ "modified outside the wrapper (its final value was %s, its value "
+ "when a checkpoint dependency was added was %s), which breaks "
+ "restoration on object creation.\n\nIf you don't need this "
+ "dictionary checkpointed, wrap it in a "
+ "tf.contrib.checkpoint.NoDependency object; it will be automatically "
+ "un-wrapped and subsequently ignored." % (
+ self, self, self._last_wrapped_dict_snapshot))
+ assert not self._dirty # Any reason for dirtiness should have an exception.
+ return super(_DictWrapper, self)._checkpoint_dependencies
+
+ @property
+ def _dirty(self):
+ """Check if there has already been a mutation which prevents saving."""
+ return (self._external_modification
+ or self._non_append_mutation
+ or self._non_string_key)
+
+ def _check_external_modification(self):
+ """Checks for any changes to the wrapped dict not through the wrapper."""
+ if self._dirty:
+ return
+ if self != self._last_wrapped_dict_snapshot:
+ self._external_modification = True
+ self._last_wrapped_dict_snapshot = None
+
+ def _update_snapshot(self):
+ """Acknowledges tracked changes to the wrapped dict."""
+ if self._dirty:
+ return
+ self._last_wrapped_dict_snapshot = dict(self)
+
+ def _track_value(self, value, name):
+ """Allows storage of non-checkpointable objects."""
+ if isinstance(name, six.string_types):
+ string_key = True
+ else:
+ name = "-non_string_key"
+ string_key = False
+ try:
+ no_dependency = isinstance(value, NoDependency)
+ value = super(_DictWrapper, self)._track_value(value=value, name=name)
+ if not (string_key or no_dependency):
+ # A non-string key maps to a checkpointable value. This data structure
+ # is not saveable.
+ self._non_string_key = True
+ return value
+ except ValueError:
+ # Even if this value isn't checkpointable, we need to make sure
+ # NoDependency objects get unwrapped.
+ return sticky_attribute_assignment(
+ checkpointable=self, value=value, name=name)
+
+ def _name_element(self, key):
+ """Don't throw errors for non-string keys."""
+ if isinstance(key, six.string_types):
+ return super(_DictWrapper, self)._name_element(key)
+ else:
+ return key
+
+ def __setitem__(self, key, value):
+ """Allow any modifications, but possibly mark the wrapper as unsaveable."""
+ self._check_external_modification()
+ no_dep = isinstance(value, NoDependency)
+ if isinstance(key, six.string_types):
+ existing_dependency = self._lookup_dependency(key)
+ value = self._track_value(value, name=key)
+ else:
+ value = _wrap_or_unwrap(value)
+ existing_dependency = None
+ if not no_dep and isinstance(value, base.CheckpointableBase):
+ # Non-string keys are OK as long as we have no reason to add a
+ # dependency on the value (either because the value is not
+ # checkpointable, or because it was wrapped in a NoDependency object).
+ self._non_string_key = True
+ current_value = self._storage.setdefault(key, value)
+ if current_value is not value:
+ if ((not no_dep and isinstance(value, base.CheckpointableBase))
+ # We don't want to just check that the existing object is
+ # checkpointable, since it may have been wrapped in a NoDependency
+ # object.
+ or existing_dependency is not None):
+ # A checkpointable object was replaced under the same key; this means
+ # that restoring would be error-prone, so we'll throw an exception on
+ # save.
+ self._non_append_mutation = True
+ self._storage[key] = value
+
+ self._update_snapshot()
+
+ def __delitem__(self, key):
+ self._check_external_modification()
+ existing_value = self[key]
+ if isinstance(existing_value, base.CheckpointableBase):
+ # Deleting tracked checkpointable values means restoring is problematic,
+ # so we'll throw an exception on save.
+ self._non_append_mutation = True
+ del self._storage[key]
+ self._update_snapshot()
+
+ def __repr__(self):
+ return "DictWrapper(%s)" % (repr(self._storage),)
+
+ def __hash__(self):
+ raise TypeError("unhashable type: 'DictWrapper'")
+
+ def __eq__(self, other):
+ return self._storage == getattr(other, "_storage", other)
+
+ def update(self, *args, **kwargs):
+ for key, value in dict(*args, **kwargs).items():
+ self[key] = value
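A short sketch of the behavior `_DictWrapper` enables: a plain dict assigned to a checkpointable object is wrapped automatically, tracks its values, and refuses to save after a mutation that would break restore (attribute names and paths are illustrative):

```python
# Hedged sketch: automatic dict tracking and the save-time mutation check.
import tensorflow as tf
from tensorflow.python.training.checkpointable import tracking

tf.enable_eager_execution()

obj = tracking.Checkpointable()
obj.d = {}                     # transparently wrapped in a _DictWrapper
obj.d["v"] = tf.Variable(1.0)  # string key -> tracked dependency "v"

ckpt = tf.train.Checkpoint(root=obj)
ckpt.save("/tmp/dict_wrapper_demo")  # saves fine

del obj.d["v"]                 # deleting a tracked value marks the dict dirty
try:
  ckpt.save("/tmp/dict_wrapper_demo")
except ValueError as e:
  print(e)                     # "... overwritten or deleted ..."
```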
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/checkpointable/data_structures_test.py
index 7bee00a927..472b7c32b4 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/checkpointable/data_structures_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.training.checkpointable import data_structures
from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util
class HasList(training.Model):
@@ -303,6 +304,124 @@ class MappingTests(test.TestCase):
data_structures.Mapping()])
self.assertEqual(2, len(has_mappings))
self.assertNotIn(data_structures.Mapping(), has_mappings)
+ # In contrast to Mapping, dict wrappers are not hashable
+ a = tracking.Checkpointable()
+ a.d = {}
+ self.assertEqual({}, a.d)
+ self.assertFalse({} != a.d) # pylint: disable=g-explicit-bool-comparison
+ self.assertNotEqual({1: 2}, a.d)
+ with self.assertRaisesRegexp(TypeError, "unhashable"):
+ set([a.d])
+
+ def testDictWrapperBadKeys(self):
+ a = tracking.Checkpointable()
+ a.d = {}
+ a.d[1] = data_structures.List()
+ model = training.Model()
+ model.sub = a
+ save_path = os.path.join(self.get_temp_dir(), "ckpt")
+ with self.assertRaisesRegexp(ValueError, "non-string key"):
+ model.save_weights(save_path)
+
+ def testDictWrapperNoDependency(self):
+ a = tracking.Checkpointable()
+ a.d = data_structures.NoDependency({})
+ a.d[1] = [3]
+ self.assertEqual([a], util.list_objects(a))
+ model = training.Model()
+ model.sub = a
+ save_path = os.path.join(self.get_temp_dir(), "ckpt")
+ model.save_weights(save_path)
+ model.load_weights(save_path)
+
+ def testNonStringKeyNotCheckpointableValue(self):
+ a = tracking.Checkpointable()
+ a.d = {}
+ a.d["a"] = [3]
+ a.d[1] = data_structures.NoDependency([3])
+ self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a))
+ model = training.Model()
+ model.sub = a
+ save_path = os.path.join(self.get_temp_dir(), "ckpt")
+ model.save_weights(save_path)
+ model.load_weights(save_path)
+
+ def testNonAppendNotCheckpointable(self):
+ # Non-append mutations (deleting or overwriting values) are OK when the
+ # values aren't tracked.
+ a = tracking.Checkpointable()
+ a.d = {}
+ a.d["a"] = [3]
+ a.d[1] = 3
+ a.d[1] = 2
+ self.assertEqual(2, a.d[1])
+ del a.d[1]
+ a.d[2] = data_structures.NoDependency(tracking.Checkpointable())
+ second = tracking.Checkpointable()
+ a.d[2] = data_structures.NoDependency(second)
+ self.assertIs(second, a.d[2])
+ self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a))
+ model = training.Model()
+ model.sub = a
+ save_path = os.path.join(self.get_temp_dir(), "ckpt")
+ model.save_weights(save_path)
+ model.load_weights(save_path)
+
+ def testDelNoSave(self):
+ model = training.Model()
+ model.d = {}
+ model.d["a"] = []
+ del model.d["a"]
+ save_path = os.path.join(self.get_temp_dir(), "ckpt")
+ with self.assertRaisesRegexp(ValueError, "overwritten or deleted"):
+ model.save_weights(save_path)
+
+ def testPopNoSave(self):
+ model = training.Model()
+ model.d = {}
+ model.d["a"] = []
+ model.d.pop("a")
+ save_path = os.path.join(self.get_temp_dir(), "ckpt")
+ with self.assertRaisesRegexp(ValueError, "overwritten or deleted"):
+ model.save_weights(save_path)
+
+ def testExternalModificationNoSave(self):
+ model = training.Model()
+ external_reference = {}
+ model.d = external_reference
+ external_reference["a"] = []
+ save_path = os.path.join(self.get_temp_dir(), "ckpt")
+ with self.assertRaisesRegexp(ValueError, "modified outside the wrapper"):
+ model.save_weights(save_path)
+
+ def testOverwriteNoSave(self):
+ model = training.Model()
+ model.d = {}
+ model.d["a"] = {}
+ model.d["a"] = {}
+ save_path = os.path.join(self.get_temp_dir(), "ckpt")
+ with self.assertRaisesRegexp(ValueError, "overwritten or deleted"):
+ model.save_weights(save_path)
+
+ def testIter(self):
+ model = training.Model()
+ model.d = {1: 3}
+ model.d[1] = 3
+ self.assertEqual([1], list(model.d))
+ new_dict = {}
+ # This update() is super tricky. If the dict wrapper subclasses dict,
+ # CPython will access its storage directly instead of calling any
+ # methods/properties on the object. So the options are either not to
+ # subclass dict (in which case update will call normal iter methods, but the
+ # object won't pass isinstance checks) or to subclass dict and keep that
+ # storage updated (no shadowing all its methods like _ListWrapper).
+ new_dict.update(model.d)
+ self.assertEqual({1: 3}, new_dict)
+
+ def testConstructableFromSequence(self):
+ result = data_structures._DictWrapper([(1, 2), (3, 4)])
+ self.assertIsInstance(result, dict)
+ self.assertEqual({1: 2, 3: 4}, result)
if __name__ == "__main__":
test.main()
diff --git a/tensorflow/python/training/checkpointable/tracking_test.py b/tensorflow/python/training/checkpointable/tracking_test.py
index 96da0d6e47..f8d17cd417 100644
--- a/tensorflow/python/training/checkpointable/tracking_test.py
+++ b/tensorflow/python/training/checkpointable/tracking_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
import os
import numpy
+import six
from tensorflow.python.framework import test_util
from tensorflow.python.keras.engine import training
@@ -144,6 +145,29 @@ class InterfaceTests(test.TestCase):
checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt"))
@test_util.run_in_graph_and_eager_modes
+ def testDictionariesBasic(self):
+ a = training.Model()
+ b = training.Model()
+ a.attribute = {"b": b}
+ c = training.Model()
+ a.attribute["c"] = []
+ a.attribute["c"].append(c)
+ a_deps = util.list_objects(a)
+ self.assertIn(b, a_deps)
+ self.assertIn(c, a_deps)
+ self.assertIs(b, a.attribute["b"])
+ six.assertCountEqual(
+ self,
+ ["b", "c"],
+ [dep.name for dep in a.attribute._checkpoint_dependencies])
+ self.assertEqual([b, c], a.layers)
+ self.assertEqual([b, c], a.attribute.layers)
+ self.assertEqual([c], a.attribute["c"].layers)
+ checkpoint = util.Checkpoint(a=a)
+ save_path = checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt"))
+ checkpoint.restore(save_path).assert_consumed()
+
+ @test_util.run_in_graph_and_eager_modes
def testNoDepList(self):
a = training.Model()
a.l1 = data_structures.NoDependency([])
@@ -159,12 +183,13 @@ class InterfaceTests(test.TestCase):
@test_util.run_in_graph_and_eager_modes
def testAssertions(self):
a = tracking.Checkpointable()
- a.l = [numpy.zeros([2, 2])]
- self.assertAllEqual([numpy.zeros([2, 2])], a.l)
- self.assertAllClose([numpy.zeros([2, 2])], a.l)
- nest.map_structure(self.assertAllClose, a.l, [numpy.zeros([2, 2])])
- a.tensors = [array_ops.ones([2, 2]), array_ops.zeros([3, 3])]
- self.assertAllClose([numpy.ones([2, 2]), numpy.zeros([3, 3])],
+ a.l = {"k": [numpy.zeros([2, 2])]}
+ self.assertAllEqual(nest.flatten({"k": [numpy.zeros([2, 2])]}),
+ nest.flatten(a.l))
+ self.assertAllClose({"k": [numpy.zeros([2, 2])]}, a.l)
+ nest.map_structure(self.assertAllClose, a.l, {"k": [numpy.zeros([2, 2])]})
+ a.tensors = {"k": [array_ops.ones([2, 2]), array_ops.zeros([3, 3])]}
+ self.assertAllClose({"k": [numpy.ones([2, 2]), numpy.zeros([3, 3])]},
self.evaluate(a.tensors))
if __name__ == "__main__":
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index 686232fe27..664b2348c0 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -101,6 +101,7 @@ class _CheckpointRestoreCoordinator(object):
# this checkpoint.
self.restore_ops = []
self.restore_ops_by_name = {}
+ self.new_restore_ops_callback = None
# A mapping from optimizer proto ids to lists of slot variables to be
# restored when the optimizer is tracked. Only includes slot variables whose
# regular variables have already been created, and only for optimizer
@@ -121,6 +122,11 @@ class _CheckpointRestoreCoordinator(object):
slot_variable_id=slot_reference.slot_variable_node_id,
slot_name=slot_reference.slot_name))
+ def new_restore_ops(self, new_ops):
+ self.restore_ops.extend(new_ops)
+ if self.new_restore_ops_callback:
+ self.new_restore_ops_callback(new_ops) # pylint: disable=not-callable
+
class _NameBasedRestoreCoordinator(object):
"""Keeps the status of a name-based checkpoint restore."""
@@ -361,24 +367,42 @@ class _ObjectIdentityWeakKeyDictionary(_ObjectIdentityDictionary):
yield unwrapped
-class _ObjectIdentityWeakSet(collections.MutableSet):
- """Like weakref.WeakSet, but compares objects with "is"."""
+class _ObjectIdentitySet(collections.MutableSet):
+ """Like the built-in set, but compares objects with "is"."""
- def __init__(self):
- self._storage = set()
+ def __init__(self, *args):
+ self._storage = set([self._wrap_key(obj) for obj in list(*args)])
+
+ def _wrap_key(self, key):
+ return _ObjectIdentityWrapper(key)
def __contains__(self, key):
- return _WeakObjectIdentityWrapper(key) in self._storage
+ return self._wrap_key(key) in self._storage
def discard(self, key):
- self._storage.discard(_WeakObjectIdentityWrapper(key))
+ self._storage.discard(self._wrap_key(key))
def add(self, key):
- self._storage.add(_WeakObjectIdentityWrapper(key))
+ self._storage.add(self._wrap_key(key))
+
+ def __len__(self):
+ return len(self._storage)
+
+ def __iter__(self):
+ keys = list(self._storage)
+ for key in keys:
+ yield key.unwrapped
+
+
+class _ObjectIdentityWeakSet(_ObjectIdentitySet):
+ """Like weakref.WeakSet, but compares objects with "is"."""
+
+ def _wrap_key(self, key):
+ return _WeakObjectIdentityWrapper(key)
def __len__(self):
# Iterate, discarding old weak refs
- return len(list(self))
+ return len([_ for _ in self])
def __iter__(self):
keys = list(self._storage)
@@ -803,6 +827,31 @@ class _LoadStatus(object):
pass
+def streaming_restore(status, session=None):
+ """When graph building, runs restore ops as soon as they come in.
+
+ Args:
+ status: A _LoadStatus objects from an object-based saver's
+ restore(). Streaming restore from name-based checkpoints is not currently
+ supported.
+ session: A session to run new restore ops in.
+ """
+ if context.executing_eagerly():
+ # Streaming restore is the default/only behavior when executing eagerly.
+ return
+ if session is None:
+ session = ops.get_default_session()
+ if isinstance(status, NameBasedSaverStatus):
+ raise NotImplementedError(
+ "Streaming restore not supported from name-based checkpoints. File a "
+ "feature request if this limitation bothers you.")
+ status.run_restore_ops(session=session)
+ # pylint: disable=protected-access
+ status._checkpoint.new_restore_ops_callback = (
+ lambda ops: session.run(ops, feed_dict=status._feed_dict))
+ # pylint: enable=protected-access
+
+
class CheckpointLoadStatus(_LoadStatus):
"""Checks the status of checkpoint loading and manages restore ops.
@@ -857,8 +906,8 @@ class CheckpointLoadStatus(_LoadStatus):
for checkpointable_object in list_objects(self._root_checkpointable):
self._checkpoint.all_python_objects.add(checkpointable_object)
unused_python_objects = (
- set(self._checkpoint.all_python_objects)
- - set(self._checkpoint.object_by_proto_id.values()))
+ _ObjectIdentitySet(self._checkpoint.all_python_objects)
+ - _ObjectIdentitySet(self._checkpoint.object_by_proto_id.values()))
if unused_python_objects:
raise AssertionError(
("Some Python objects were not bound to checkpointed values, likely "
@@ -974,11 +1023,13 @@ _DEPRECATED_RESTORE_INSTRUCTIONS = (
"one this message is coming from) and use that checkpoint in the future.")
-@deprecation.deprecated(
- date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
class NameBasedSaverStatus(_LoadStatus):
"""Status for loading a name-based training checkpoint."""
+ # Ideally this deprecation decorator would be on the class, but that
+ # interferes with isinstance checks.
+ @deprecation.deprecated(
+ date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
def __init__(self, checkpoint, root_checkpointable):
self._checkpoint = checkpoint
self._root_checkpointable = root_checkpointable
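A hedged sketch of `streaming_restore` in graph mode: already-created restore ops run immediately, and the registered callback runs future deferred restores as soon as their variables are created (the checkpoint prefix is illustrative):

```python
# Hedged sketch: streaming restore while graph building.
import tensorflow as tf
from tensorflow.python.training.checkpointable import util

with tf.Graph().as_default(), tf.Session() as session:
  root = tf.train.Checkpoint(v=tf.Variable(1.0))
  status = root.restore("/tmp/ckpt-1")
  # Runs pending restore ops now and wires new_restore_ops_callback so
  # later deferred restores are session.run as soon as they appear.
  util.streaming_restore(status, session=session)
```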
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 51190264e8..fd195a7965 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -356,7 +356,15 @@ def natural_exp_decay(learning_rate,
The function returns the decayed learning rate. It is computed as:
```python
- decayed_learning_rate = learning_rate * exp(-decay_rate * global_step)
+ decayed_learning_rate = learning_rate * exp(-decay_rate * global_step /
+ decay_steps)
+ ```
+
+ or, if `staircase` is `True`, as:
+
+ ```python
+ decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step /
+ decay_steps))
```
Example: decay exponentially with a base of 0.96:
@@ -365,8 +373,10 @@ def natural_exp_decay(learning_rate,
...
global_step = tf.Variable(0, trainable=False)
learning_rate = 0.1
+ decay_steps = 5
k = 0.5
- learning_rate = tf.train.exponential_time_decay(learning_rate, global_step, k)
+ learning_rate = tf.train.natural_exp_decay(learning_rate, global_step,
+ decay_steps, k)
# Passing global_step to minimize() will increment it at each step.
learning_step = (
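Plugging the example's numbers into the corrected formulas, a quick pure-Python check of the decayed rate (independent of TensorFlow):

```python
import math

learning_rate, decay_rate, decay_steps = 0.1, 0.5, 5

def natural_exp_decayed(global_step, staircase=False):
  steps = global_step / float(decay_steps)
  if staircase:
    steps = math.floor(steps)
  return learning_rate * math.exp(-decay_rate * steps)

print(natural_exp_decayed(12))                  # 0.1 * exp(-0.5 * 2.4) ~= 0.0301
print(natural_exp_decayed(12, staircase=True))  # 0.1 * exp(-0.5 * 2.0) ~= 0.0368
```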
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 11510d9928..c80cdf03be 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -126,13 +126,10 @@ class BaseSaverBuilder(object):
def f():
with ops.device(v.device):
x = v.read_value()
- # To allow variables placed on non-CPU devices to be checkpointed,
- # we copy them to CPU on the same machine first.
- device_spec = pydev.DeviceSpec().parse_from_string(v.device)
- device_spec.merge_from(
- pydev.DeviceSpec().parse_from_string("/device:CPU:0"))
- with ops.device(device_spec.to_string()):
- return array_ops.identity(x)
+ # To allow variables placed on non-CPU devices to be checkpointed,
+ # we copy them to CPU on the same machine first.
+ with ops.device("/device:CPU:0"):
+ return array_ops.identity(x)
return f
self.handle_op = var.handle
@@ -1928,6 +1925,14 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
execution is enabled.
@end_compatibility
""" # pylint: disable=g-doc-exception
+ return _import_meta_graph_with_return_elements(
+ meta_graph_or_file, clear_devices, import_scope, **kwargs)[0]
+
+
+def _import_meta_graph_with_return_elements(
+ meta_graph_or_file, clear_devices=False, import_scope=None,
+ return_elements=None, **kwargs):
+ """Import MetaGraph, and return both a saver and returned elements."""
if context.executing_eagerly():
raise RuntimeError("Exporting/importing meta graphs is not supported when "
"eager execution is enabled. No graph exists when eager "
@@ -1937,12 +1942,22 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
else:
meta_graph_def = meta_graph_or_file
- imported_vars = meta_graph.import_scoped_meta_graph(
- meta_graph_def,
- clear_devices=clear_devices,
- import_scope=import_scope,
- **kwargs)
+ imported_vars, imported_return_elements = (
+ meta_graph.import_scoped_meta_graph_with_return_elements(
+ meta_graph_def,
+ clear_devices=clear_devices,
+ import_scope=import_scope,
+ return_elements=return_elements,
+ **kwargs))
+
+ saver = _create_saver_from_imported_meta_graph(
+ meta_graph_def, import_scope, imported_vars)
+ return saver, imported_return_elements
+
+def _create_saver_from_imported_meta_graph(
+ meta_graph_def, import_scope, imported_vars):
+ """Return a saver for restoring variable values to an imported MetaGraph."""
if meta_graph_def.HasField("saver_def"):
# Infer the scope that is prepended by `import_scoped_meta_graph`.
scope = import_scope
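The private helper keeps the public `import_meta_graph` as a thin wrapper; a hedged sketch of the relationship (the meta graph path is illustrative):

```python
# Hedged sketch: the return-elements variant underlying import_meta_graph.
from tensorflow.python.training import saver as tf_saver

# Public API: returns just the saver.
saver = tf_saver.import_meta_graph("/tmp/model.ckpt.meta")

# Internal variant: also returns the ops/tensors named in return_elements.
saver, elems = tf_saver._import_meta_graph_with_return_elements(  # pylint: disable=protected-access
    "/tmp/model.ckpt.meta", return_elements=["y:0"])
```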
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index ecce8ae6bd..204e81dda0 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -786,6 +786,37 @@ class SaverTest(test.TestCase):
self.assertEqual(20.0, v1.eval())
save.save(sess, save_path)
+ # Test restoring large tensors (triggers a thread pool)
+ def testRestoreLargeTensors(self):
+ save_dir = self.get_temp_dir()
+ def _model():
+ small_v = [variable_scope.get_variable(
+ "small%d" % i, shape=[10, 2], use_resource=True) for i in range(5)]
+ large_v = [variable_scope.get_variable(
+ "large%d" % i, shape=[32000, 1000], use_resource=True)
+ for i in range(3)]
+ return small_v + large_v
+
+ save_graph = ops_lib.Graph()
+ with save_graph.as_default(), self.test_session(graph=save_graph) as sess:
+ orig_vars = _model()
+ sess.run(variables.global_variables_initializer())
+ save = saver_module.Saver(max_to_keep=1)
+ variables.global_variables_initializer().run()
+ save.save(sess, save_dir)
+ orig_vals = sess.run(orig_vars)
+
+ restore_graph = ops_lib.Graph()
+ with restore_graph.as_default(), self.test_session(
+ graph=restore_graph) as sess:
+ restored_vars = _model()
+ save = saver_module.Saver(max_to_keep=1)
+ save.restore(sess, save_dir)
+ restored_vals = sess.run(restored_vars)
+
+ for orig, restored in zip(orig_vals, restored_vals):
+ self.assertAllEqual(orig, restored)
+
class SaveRestoreShardedTest(test.TestCase):
diff --git a/tensorflow/python/util/function_utils.py b/tensorflow/python/util/function_utils.py
index 7bbbde3cd2..4e9b07e20a 100644
--- a/tensorflow/python/util/function_utils.py
+++ b/tensorflow/python/util/function_utils.py
@@ -20,6 +20,8 @@ from __future__ import print_function
import functools
+import six
+
from tensorflow.python.util import tf_decorator
from tensorflow.python.util import tf_inspect
@@ -55,3 +57,36 @@ def fn_args(fn):
if _is_bounded_method(fn):
args.remove('self')
return tuple(args)
+
+
+def get_func_name(func):
+ """Returns name of passed callable."""
+ _, func = tf_decorator.unwrap(func)
+ if callable(func):
+ if tf_inspect.isfunction(func):
+ return func.__name__
+ elif tf_inspect.ismethod(func):
+ return '%s.%s' % (six.get_method_self(func).__class__.__name__,
+ six.get_method_function(func).__name__)
+ else: # Probably a class instance with __call__
+ return str(type(func))
+ else:
+ raise ValueError('Argument must be callable')
+
+
+def get_func_code(func):
+ """Returns func_code of passed callable, or None if not available."""
+ _, func = tf_decorator.unwrap(func)
+ if callable(func):
+ if tf_inspect.isfunction(func) or tf_inspect.ismethod(func):
+ return six.get_function_code(func)
+ # Since the object is not a function or method, but is a callable, we will
+ # try to access the __call__ method as a function. This works with callable
+ # classes but fails with functools.partial objects despite their __call__
+ # attribute.
+ try:
+ return six.get_function_code(func.__call__)
+ except AttributeError:
+ return None
+ else:
+ raise ValueError('Argument must be callable')
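A quick sketch of the two new helpers against the kinds of callables the tests below exercise:

```python
import functools

from tensorflow.python.util import function_utils

def silly_example_function():
  pass

print(function_utils.get_func_name(silly_example_function))  # 'silly_example_function'
print(function_utils.get_func_name(lambda x: x))             # '<lambda>'

code = function_utils.get_func_code(silly_example_function)
print(code.co_filename)  # the file that defined the function

# functools.partial is callable but exposes no usable func_code:
print(function_utils.get_func_code(functools.partial(silly_example_function)))  # None
```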
diff --git a/tensorflow/python/util/function_utils_test.py b/tensorflow/python/util/function_utils_test.py
index e78cf6a5b0..1588328c26 100644
--- a/tensorflow/python/util/function_utils_test.py
+++ b/tensorflow/python/util/function_utils_test.py
@@ -24,6 +24,16 @@ from tensorflow.python.platform import test
from tensorflow.python.util import function_utils
+def silly_example_function():
+ pass
+
+
+class SillyCallableClass(object):
+
+ def __call__(self):
+ pass
+
+
class FnArgsTest(test.TestCase):
def test_simple_function(self):
@@ -124,5 +134,73 @@ class FnArgsTest(test.TestCase):
self.assertEqual(3, double_wrapped_fn(3))
self.assertEqual(3, double_wrapped_fn(a=3))
+
+class GetFuncNameTest(test.TestCase):
+
+ def testWithSimpleFunction(self):
+ self.assertEqual(
+ 'silly_example_function',
+ function_utils.get_func_name(silly_example_function))
+
+ def testWithClassMethod(self):
+ self.assertEqual(
+ 'GetFuncNameTest.testWithClassMethod',
+ function_utils.get_func_name(self.testWithClassMethod))
+
+ def testWithCallableClass(self):
+ callable_instance = SillyCallableClass()
+ self.assertRegexpMatches(
+ function_utils.get_func_name(callable_instance),
+ '<.*SillyCallableClass.*>')
+
+ def testWithFunctoolsPartial(self):
+ partial = functools.partial(silly_example_function)
+ self.assertRegexpMatches(
+ function_utils.get_func_name(partial),
+ '<.*functools.partial.*>')
+
+ def testWithLambda(self):
+ anon_fn = lambda x: x
+ self.assertEqual('<lambda>', function_utils.get_func_name(anon_fn))
+
+ def testRaisesWithNonCallableObject(self):
+ with self.assertRaises(ValueError):
+ function_utils.get_func_name(None)
+
+
+class GetFuncCodeTest(test.TestCase):
+
+ def testWithSimpleFunction(self):
+ code = function_utils.get_func_code(silly_example_function)
+ self.assertIsNotNone(code)
+ self.assertRegexpMatches(code.co_filename, 'function_utils_test.py')
+
+ def testWithClassMethod(self):
+ code = function_utils.get_func_code(self.testWithClassMethod)
+ self.assertIsNotNone(code)
+ self.assertRegexpMatches(code.co_filename, 'function_utils_test.py')
+
+ def testWithCallableClass(self):
+ callable_instance = SillyCallableClass()
+ code = function_utils.get_func_code(callable_instance)
+ self.assertIsNotNone(code)
+ self.assertRegexpMatches(code.co_filename, 'function_utils_test.py')
+
+ def testWithLambda(self):
+ anon_fn = lambda x: x
+ code = function_utils.get_func_code(anon_fn)
+ self.assertIsNotNone(code)
+ self.assertRegexpMatches(code.co_filename, 'function_utils_test.py')
+
+ def testWithFunctoolsPartial(self):
+ partial = functools.partial(silly_example_function)
+ code = function_utils.get_func_code(partial)
+ self.assertIsNone(code)
+
+ def testRaisesWithNonCallableObject(self):
+ with self.assertRaises(ValueError):
+ function_utils.get_func_code(None)
+
+
if __name__ == '__main__':
test.main()
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index d63f59a8c8..5aac559b9b 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -73,7 +73,7 @@ def _sequence_like(instance, args):
Returns:
`args` with the type of `instance`.
"""
- if isinstance(instance, dict):
+ if isinstance(instance, (dict, _collections.Mapping)):
# Pack dictionaries in a deterministic order by sorting the keys.
# Notice this means that we ignore the original order of `OrderedDict`
# instances. This is intentional, to avoid potential bugs caused by mixing
@@ -89,7 +89,7 @@ def _sequence_like(instance, args):
def _yield_value(iterable):
- if isinstance(iterable, dict):
+ if isinstance(iterable, (dict, _collections.Mapping)):
# Iterate through dictionaries in a deterministic order by sorting the
# keys. Notice this means that we ignore the original order of `OrderedDict`
# instances. This is intentional, to avoid potential bugs caused by mixing
@@ -215,7 +215,7 @@ def flatten_dict_items(dictionary):
ValueError: If any key and value have not the same structure, or if keys are
not unique.
"""
- if not isinstance(dictionary, dict):
+ if not isinstance(dictionary, (dict, _collections.Mapping)):
raise TypeError("input must be a dictionary")
flat_dictionary = {}
for i, v in _six.iteritems(dictionary):
@@ -455,7 +455,7 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
"structure has length %s, while shallow structure has length %s."
% (len(input_tree), len(shallow_tree)))
- if check_types and isinstance(shallow_tree, dict):
+ if check_types and isinstance(shallow_tree, (dict, _collections.Mapping)):
if set(input_tree) != set(shallow_tree):
raise ValueError(
"The two structures don't have the same keys. Input "
@@ -716,7 +716,7 @@ def yield_flat_paths(nest):
# The _maybe_add_final_path_element function is used below in order to avoid
# adding trailing slashes when the sub-element recursed into is a leaf.
- if isinstance(nest, dict):
+ if isinstance(nest, (dict, _collections.Mapping)):
for key in _sorted(nest):
value = nest[key]
for sub_path in yield_flat_paths(value):
@@ -760,3 +760,4 @@ def flatten_with_joined_string_paths(structure, separator="/"):
_pywrap_tensorflow.RegisterSequenceClass(_collections.Sequence)
+_pywrap_tensorflow.RegisterMappingClass(_collections.Mapping)
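A sketch of what registering `collections.Mapping` buys: custom mapping types now flatten and repack like dicts, with keys visited in sorted order (the `_CustomMapping` class mirrors the one defined in the tests below):

```python
import collections

from tensorflow.python.util import nest

class _CustomMapping(collections.Mapping):
  """Minimal read-only mapping, as in the tests below."""

  def __init__(self, *args, **kwargs):
    self._wrapped = dict(*args, **kwargs)

  def __getitem__(self, key):
    return self._wrapped[key]

  def __iter__(self):
    return iter(self._wrapped)

  def __len__(self):
    return len(self._wrapped)

m = _CustomMapping([("d", 3), ("b", 1), ("a", 0), ("c", 2)])
flat = nest.flatten(m)                  # keys sorted: [0, 1, 2, 3]
packed = nest.pack_sequence_as(m, flat)
assert isinstance(packed, _CustomMapping)
```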
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 2f12b25354..26c6ea4b01 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
import collections
import time
+from absl.testing import parameterized
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
@@ -33,7 +34,22 @@ from tensorflow.python.platform import test
from tensorflow.python.util import nest
-class NestTest(test.TestCase):
+class _CustomMapping(collections.Mapping):
+
+ def __init__(self, *args, **kwargs):
+ self._wrapped = dict(*args, **kwargs)
+
+ def __getitem__(self, key):
+ return self._wrapped[key]
+
+ def __iter__(self):
+ return iter(self._wrapped)
+
+ def __len__(self):
+ return len(self._wrapped)
+
+
+class NestTest(parameterized.TestCase, test.TestCase):
PointXY = collections.namedtuple("Point", ["x", "y"]) # pylint: disable=invalid-name
@@ -72,26 +88,32 @@ class NestTest(test.TestCase):
with self.assertRaises(ValueError):
nest.pack_sequence_as([5, 6, [7, 8]], ["a", "b", "c"])
+ @parameterized.parameters({"mapping_type": collections.OrderedDict},
+ {"mapping_type": _CustomMapping})
@test_util.assert_no_new_pyobjects_executing_eagerly
- def testFlattenDictOrder(self):
+ def testFlattenDictOrder(self, mapping_type):
"""`flatten` orders dicts by key, including OrderedDicts."""
- ordered = collections.OrderedDict([("d", 3), ("b", 1), ("a", 0), ("c", 2)])
+ ordered = mapping_type([("d", 3), ("b", 1), ("a", 0), ("c", 2)])
plain = {"d": 3, "b": 1, "a": 0, "c": 2}
ordered_flat = nest.flatten(ordered)
plain_flat = nest.flatten(plain)
self.assertEqual([0, 1, 2, 3], ordered_flat)
self.assertEqual([0, 1, 2, 3], plain_flat)
- def testPackDictOrder(self):
+ @parameterized.parameters({"mapping_type": collections.OrderedDict},
+ {"mapping_type": _CustomMapping})
+ def testPackDictOrder(self, mapping_type):
"""Packing orders dicts by key, including OrderedDicts."""
- ordered = collections.OrderedDict([("d", 0), ("b", 0), ("a", 0), ("c", 0)])
+ custom = mapping_type([("d", 0), ("b", 0), ("a", 0), ("c", 0)])
plain = {"d": 0, "b": 0, "a": 0, "c": 0}
seq = [0, 1, 2, 3]
- ordered_reconstruction = nest.pack_sequence_as(ordered, seq)
+ custom_reconstruction = nest.pack_sequence_as(custom, seq)
plain_reconstruction = nest.pack_sequence_as(plain, seq)
+ self.assertIsInstance(custom_reconstruction, mapping_type)
+ self.assertIsInstance(plain_reconstruction, dict)
self.assertEqual(
- collections.OrderedDict([("d", 3), ("b", 1), ("a", 0), ("c", 2)]),
- ordered_reconstruction)
+ mapping_type([("d", 3), ("b", 1), ("a", 0), ("c", 2)]),
+ custom_reconstruction)
self.assertEqual({"d": 3, "b": 1, "a": 0, "c": 2}, plain_reconstruction)
Abc = collections.namedtuple("A", ("b", "c")) # pylint: disable=invalid-name
@@ -101,8 +123,10 @@ class NestTest(test.TestCase):
# A nice messy mix of tuples, lists, dicts, and `OrderedDict`s.
mess = [
"z",
- NestTest.Abc(3, 4),
- {
+ NestTest.Abc(3, 4), {
+ "d": _CustomMapping({
+ 41: 4
+ }),
"c": [
1,
collections.OrderedDict([
@@ -111,17 +135,19 @@ class NestTest(test.TestCase):
]),
],
"b": 5
- },
- 17
+ }, 17
]
flattened = nest.flatten(mess)
- self.assertEqual(flattened, ["z", 3, 4, 5, 1, 2, 3, 17])
+ self.assertEqual(flattened, ["z", 3, 4, 5, 1, 2, 3, 4, 17])
structure_of_mess = [
14,
NestTest.Abc("a", True),
{
+ "d": _CustomMapping({
+ 41: 42
+ }),
"c": [
0,
collections.OrderedDict([
@@ -142,6 +168,10 @@ class NestTest(test.TestCase):
self.assertIsInstance(unflattened_ordered_dict, collections.OrderedDict)
self.assertEqual(list(unflattened_ordered_dict.keys()), ["b", "a"])
+ unflattened_custom_mapping = unflattened[2]["d"]
+ self.assertIsInstance(unflattened_custom_mapping, _CustomMapping)
+ self.assertEqual(list(unflattened_custom_mapping.keys()), [41])
+
def testFlatten_numpyIsNotFlattened(self):
structure = np.array([1, 2, 3])
flattened = nest.flatten(structure)
@@ -179,19 +209,23 @@ class NestTest(test.TestCase):
self.assertFalse(nest.is_sequence(math_ops.tanh(ones)))
self.assertFalse(nest.is_sequence(np.ones((4, 5))))
- def testFlattenDictItems(self):
- dictionary = {(4, 5, (6, 8)): ("a", "b", ("c", "d"))}
+ @parameterized.parameters({"mapping_type": _CustomMapping},
+ {"mapping_type": dict})
+ def testFlattenDictItems(self, mapping_type):
+ dictionary = mapping_type({(4, 5, (6, 8)): ("a", "b", ("c", "d"))})
flat = {4: "a", 5: "b", 6: "c", 8: "d"}
self.assertEqual(nest.flatten_dict_items(dictionary), flat)
with self.assertRaises(TypeError):
nest.flatten_dict_items(4)
- bad_dictionary = {(4, 5, (4, 8)): ("a", "b", ("c", "d"))}
+ bad_dictionary = mapping_type({(4, 5, (4, 8)): ("a", "b", ("c", "d"))})
with self.assertRaisesRegexp(ValueError, "not unique"):
nest.flatten_dict_items(bad_dictionary)
- another_bad_dictionary = {(4, 5, (6, 8)): ("a", "b", ("c", ("d", "e")))}
+ another_bad_dictionary = mapping_type({
+ (4, 5, (6, 8)): ("a", "b", ("c", ("d", "e")))
+ })
with self.assertRaisesRegexp(
ValueError, "Key had [0-9]* elements, but value had [0-9]* elements"):
nest.flatten_dict_items(another_bad_dictionary)
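
The behavior pinned down by these tests is that `nest.flatten` visits a mapping's values in sorted-key order regardless of insertion order, while `nest.pack_sequence_as` rebuilds the original mapping type. A minimal C++ model of that sorted-key contract (illustrative only, not part of this commit):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Flattens a string-keyed mapping to its values in sorted-key order,
    // mirroring what nest.flatten guarantees for dicts, OrderedDicts, and
    // custom collections.Mapping subclasses alike.
    template <typename Mapping>
    std::vector<typename Mapping::mapped_type> FlattenSorted(const Mapping& m) {
      std::vector<std::string> keys;
      for (const auto& kv : m) keys.push_back(kv.first);
      std::sort(keys.begin(), keys.end());
      std::vector<typename Mapping::mapped_type> values;
      for (const auto& key : keys) values.push_back(m.at(key));
      return values;
    }

    int main() {
      // Insertion order differs from key order, as in the tests above.
      std::unordered_map<std::string, int> mapping{
          {"d", 3}, {"b", 1}, {"a", 0}, {"c", 2}};
      for (int v : FlattenSorted(mapping)) std::cout << v << ' ';  // 0 1 2 3
      std::cout << '\n';
    }
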
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 366f8a0deb..ad85a44f8d 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -31,6 +31,8 @@ namespace {
// Type object for collections.Sequence. This is set by RegisterSequenceClass.
PyObject* CollectionsSequenceType = nullptr;
+// Type object for collections.Mapping, set by RegisterMappingClass.
+PyObject* CollectionsMappingType = nullptr;
PyTypeObject* SparseTensorValueType = nullptr;
const int kMaxItemsInCache = 1024;
@@ -45,6 +47,23 @@ bool IsString(PyObject* o) {
PyUnicode_Check(o);
}
+// Work around a writable-strings warning with Python 2's PyMapping_Keys macro,
+// and while we're at it give Python 2 and 3 consistent behavior by making sure
+// the returned value is a list.
+//
+// As with PyMapping_Keys, returns a new reference.
+PyObject* MappingKeys(PyObject* o) {
+#if PY_MAJOR_VERSION >= 3
+ return PyMapping_Keys(o);
+#else
+ static char key_method_name[] = "keys";
+ Safe_PyObjectPtr raw_result(PyObject_CallMethod(o, key_method_name, nullptr));
+ return PySequence_Fast(
+ raw_result.get(),
+ "The '.keys()' method of a custom mapping returned a non-sequence.");
+#endif
+}
+
// Equivalent to Python's 'o.__class__.__name__'
// Note that '__class__' attribute is set only in new-style classes.
// A lot of tensorflow code uses __class__ without checks, so it seems like
@@ -85,6 +104,119 @@ string PyObjectToString(PyObject* o) {
}
}
+class CachedTypeCheck {
+ public:
+ explicit CachedTypeCheck(std::function<int(PyObject*)> ternary_predicate)
+ : ternary_predicate_(std::move(ternary_predicate)) {}
+
+ ~CachedTypeCheck() {
+ mutex_lock l(type_to_sequence_map_mu_);
+ for (const auto& pair : type_to_sequence_map_) {
+ Py_DECREF(pair.first);
+ }
+ }
+
+ // Caches successful executions of the one-argument (PyObject*) callable
+ // "ternary_predicate" based on the type of "o". -1 from the callable
+ // indicates an unsuccessful check (not cached), 0 indicates that "o"'s type
+ // does not match the predicate, and 1 indicates that it does. Used to avoid
+ // calling back into Python for expensive isinstance checks.
+ int CachedLookup(PyObject* o) {
+ // Try not to return to Python - see if the type has already been seen
+ // before.
+
+ auto* type = Py_TYPE(o);
+
+ {
+ mutex_lock l(type_to_sequence_map_mu_);
+ auto it = type_to_sequence_map_.find(type);
+ if (it != type_to_sequence_map_.end()) {
+ return it->second;
+ }
+ }
+
+ int check_result = ternary_predicate_(o);
+
+ if (check_result == -1) {
+ return -1; // Type check error, not cached.
+ }
+
+    // NOTE: The reference taken on the type here is never released while the
+    // map lives (likely for the life of the process), since the type must not
+    // be deleted while it is still a key in the map. This should not be too
+    // much of a leak, as there should only be a relatively small number of
+    // types in the map, and an even smaller number that are eligible for
+    // decref. As a precaution, we limit the size of the map to 1024.
+ {
+ mutex_lock l(type_to_sequence_map_mu_);
+ if (type_to_sequence_map_.size() < kMaxItemsInCache) {
+ Py_INCREF(type);
+ type_to_sequence_map_.insert({type, check_result});
+ }
+ }
+
+ return check_result;
+ }
+
+ private:
+ std::function<int(PyObject*)> ternary_predicate_;
+ mutex type_to_sequence_map_mu_;
+ std::unordered_map<PyTypeObject*, bool> type_to_sequence_map_
+ GUARDED_BY(type_to_sequence_map_mu_);
+};
+
+// Returns 1 if `o` is considered a mapping for the purposes of Flatten().
+// Returns 0 otherwise.
+// Returns -1 if an error occurred.
+int IsMappingHelper(PyObject* o) {
+ static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+ return PyObject_IsInstance(to_check, CollectionsMappingType);
+ });
+ if (PyDict_Check(o)) return true;
+ if (TF_PREDICT_FALSE(CollectionsMappingType == nullptr)) {
+ PyErr_SetString(
+ PyExc_RuntimeError,
+ tensorflow::strings::StrCat(
+ "collections.Mapping type has not been set. "
+ "Please call RegisterMappingClass before using this module")
+ .c_str());
+ return -1;
+ }
+ return check_cache->CachedLookup(o);
+}
+
+// Returns 1 if `o` is considered a sequence for the purposes of Flatten().
+// Returns 0 otherwise.
+// Returns -1 if an error occurred.
+int IsSequenceHelper(PyObject* o) {
+ static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+ int is_instance = PyObject_IsInstance(to_check, CollectionsSequenceType);
+
+ // Don't cache a failed is_instance check.
+ if (is_instance == -1) return -1;
+
+ return static_cast<int>(is_instance != 0 && !IsString(to_check));
+ });
+ // We treat dicts and other mappings as special cases of sequences.
+ if (IsMappingHelper(o)) return true;
+ if (PySet_Check(o) && !WarnedThatSetIsNotSequence) {
+ LOG(WARNING) << "Sets are not currently considered sequences, "
+ "but this may change in the future, "
+ "so consider avoiding using them.";
+ WarnedThatSetIsNotSequence = true;
+ }
+ if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
+ PyErr_SetString(
+ PyExc_RuntimeError,
+ tensorflow::strings::StrCat(
+ "collections.Sequence type has not been set. "
+ "Please call RegisterSequenceClass before using this module")
+ .c_str());
+ return -1;
+ }
+ return check_cache->CachedLookup(o);
+}
+
// Implements the same idea as tensorflow.util.nest._yield_value
// During construction we check if the iterable is a dictionary.
// If so, we construct a sequence from its sorted keys that will be used
@@ -96,7 +228,12 @@ string PyObjectToString(PyObject* o) {
// 'iterable' must not be modified while ValIterator is used.
class ValIterator {
public:
- explicit ValIterator(PyObject* iterable) : dict_(nullptr), index_(0) {
+ explicit ValIterator(PyObject* iterable)
+ : dict_(nullptr),
+ mapping_(nullptr),
+ last_mapping_element_(nullptr),
+ seq_(nullptr),
+ index_(0) {
if (PyDict_Check(iterable)) {
dict_ = iterable;
// PyDict_Keys returns a list, which can be used with
@@ -108,6 +245,10 @@ class ValIterator {
// bugs caused by mixing ordered and plain dicts (e.g., flattening
// a dict but using a corresponding `OrderedDict` to pack it back).
PyList_Sort(seq_);
+ } else if (IsMappingHelper(iterable)) {
+ mapping_ = iterable;
+ seq_ = MappingKeys(iterable);
+ PyList_Sort(seq_);
} else {
seq_ = PySequence_Fast(iterable, "");
}
@@ -122,7 +263,9 @@ class ValIterator {
PyObject* element = nullptr;
if (index_ < size_) {
// Both PySequence_Fast_GET_ITEM and PyDict_GetItem return borrowed
- // references.
+ // references. For general mappings, ValIterator keeps a reference to the
+ // last retrieved element (and decrefs it before producing the next
+ // element) to abstract away the borrowed/new difference.
element = PySequence_Fast_GET_ITEM(seq_, index_);
++index_;
if (dict_ != nullptr) {
@@ -132,85 +275,32 @@ class ValIterator {
"Dictionary was modified during iteration over it");
return nullptr;
}
+ } else if (mapping_ != nullptr) {
+ element = PyObject_GetItem(mapping_, element);
+ if (element == nullptr) {
+ PyErr_SetString(PyExc_RuntimeError,
+ "Mapping was modified during iteration over it");
+ return nullptr;
+ }
+ last_mapping_element_.reset(element);
}
}
return element;
}
private:
- PyObject* seq_;
+ // Special casing for things that pass PyDict_Check (faster, no Python calls)
PyObject* dict_;
+
+ // General mappings which have custom Python logic
+ PyObject* mapping_;
+ Safe_PyObjectPtr last_mapping_element_;
+
+ PyObject* seq_;
Py_ssize_t size_;
Py_ssize_t index_;
};
-mutex g_type_to_sequence_map(LINKER_INITIALIZED);
-std::unordered_map<PyTypeObject*, bool>* IsTypeSequenceMap() {
- static auto* const m = new std::unordered_map<PyTypeObject*, bool>;
- return m;
-}
-
-// Returns 1 if `o` is considered a sequence for the purposes of Flatten().
-// Returns 0 otherwise.
-// Returns -1 if an error occurred.
-int IsSequenceHelper(PyObject* o) {
- if (PyDict_Check(o)) return true;
- if (PySet_Check(o) && !WarnedThatSetIsNotSequence) {
- LOG(WARNING) << "Sets are not currently considered sequences, "
- "but this may change in the future, "
- "so consider avoiding using them.";
- WarnedThatSetIsNotSequence = true;
- }
- if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
- PyErr_SetString(
- PyExc_RuntimeError,
- tensorflow::strings::StrCat(
- "collections.Sequence type has not been set. "
- "Please call RegisterSequenceClass before using this module")
- .c_str());
- return -1;
- }
-
- // Try not to return to Python - see if the type has already been seen
- // before.
-
- auto* type_to_sequence_map = IsTypeSequenceMap();
- auto* type = Py_TYPE(o);
-
- {
- mutex_lock l(g_type_to_sequence_map);
- auto it = type_to_sequence_map->find(type);
- if (it != type_to_sequence_map->end()) {
- return it->second;
- }
- }
-
- // NOTE: We explicitly release the g_type_to_sequence_map mutex,
- // because PyObject_IsInstance() may release the GIL, allowing another thread
- // concurrent entry to this function.
- int is_instance = PyObject_IsInstance(o, CollectionsSequenceType);
-
- // Don't cache a failed is_instance check.
- if (is_instance == -1) return -1;
-
- bool is_sequence = static_cast<int>(is_instance != 0 && !IsString(o));
-
- // NOTE: This is never decref'd, but we don't want the type to get deleted
- // as long as it is in the map. This should not be too much of a
- // leak, as there should only be a relatively small number of types in the
- // map, and an even smaller number that are eligible for decref. As a
- // precaution, we limit the size of the map to 1024.
- {
- mutex_lock l(g_type_to_sequence_map);
- if (type_to_sequence_map->size() < kMaxItemsInCache) {
- Py_INCREF(type);
- type_to_sequence_map->insert({type, is_sequence});
- }
- }
-
- return is_sequence;
-}
-
bool IsSparseTensorValueType(PyObject* o) {
if (TF_PREDICT_FALSE(SparseTensorValueType == nullptr)) {
return false;
@@ -226,21 +316,35 @@ int IsSequenceForDataHelper(PyObject* o) {
bool GetNextValuesForDict(PyObject* nested,
std::vector<Safe_PyObjectPtr>* next_values) {
- std::vector<PyObject*> result;
-
- PyObject* keys = PyDict_Keys(nested);
- if (PyList_Sort(keys) == -1) return false;
- Py_ssize_t size = PyList_Size(keys);
+ Safe_PyObjectPtr keys(PyDict_Keys(nested));
+ if (PyList_Sort(keys.get()) == -1) return false;
+ Py_ssize_t size = PyList_Size(keys.get());
for (Py_ssize_t i = 0; i < size; ++i) {
// We know that key and item will not be deleted because nested owns
// a reference to them and callers of flatten must not modify nested
// while the method is running.
- PyObject* key = PyList_GET_ITEM(keys, i);
+ PyObject* key = PyList_GET_ITEM(keys.get(), i);
PyObject* item = PyDict_GetItem(nested, key);
Py_INCREF(item);
next_values->emplace_back(item);
}
- Py_DECREF(keys);
+ return true;
+}
+
+bool GetNextValuesForMapping(PyObject* nested,
+ std::vector<Safe_PyObjectPtr>* next_values) {
+ Safe_PyObjectPtr keys(MappingKeys(nested));
+ if (keys.get() == nullptr) {
+ return false;
+ }
+ if (PyList_Sort(keys.get()) == -1) return false;
+ Py_ssize_t size = PyList_Size(keys.get());
+ for (Py_ssize_t i = 0; i < size; ++i) {
+ PyObject* key = PyList_GET_ITEM(keys.get(), i);
+ // Unlike PyDict_GetItem, PyObject_GetItem returns a new reference.
+ PyObject* item = PyObject_GetItem(nested, key);
+ next_values->emplace_back(item);
+ }
return true;
}
@@ -265,6 +369,9 @@ bool GetNextValues(PyObject* nested,
if (PyDict_Check(nested)) {
// if nested is dictionary, sort it by key and recurse on each value
return GetNextValuesForDict(nested, next_values);
+ } else if (IsMappingHelper(nested)) {
+ // same treatment as dictionaries, but for custom mapping types
+ return GetNextValuesForMapping(nested, next_values);
}
// iterate and recurse
return GetNextValuesForIterable(nested, next_values);
@@ -276,6 +383,9 @@ bool GetNextValuesForData(PyObject* nested,
if (PyDict_Check(nested)) {
// if nested is dictionary, sort it by key and recurse on each value
return GetNextValuesForDict(nested, next_values);
+ } else if (IsMappingHelper(nested)) {
+ // same treatment as dictionaries, but for custom mapping types
+ return GetNextValuesForMapping(nested, next_values);
} else if (IsSparseTensorValueType(nested)) {
// if nested is a SparseTensorValue, just return itself as a single item
Py_INCREF(nested);
@@ -320,8 +430,8 @@ bool FlattenHelper(
// 'dict1' and 'dict2' are assumed to be Python dictionaries.
void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
bool* is_type_error) {
- PyObject* k1 = PyDict_Keys(dict1);
- PyObject* k2 = PyDict_Keys(dict2);
+ PyObject* k1 = MappingKeys(dict1);
+ PyObject* k2 = MappingKeys(dict2);
*is_type_error = false;
*error_msg = tensorflow::strings::StrCat(
"The two dictionaries don't have the same set of keys. "
@@ -396,9 +506,12 @@ bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
}
} else if (type1 != type2
/* If both sequences are list types, don't complain. This allows
- one to be a list subclass (e.g. _ListWrapper used for automatic
- dependency tracking.) */
- && !(PyList_Check(o1) && PyList_Check(o2))) {
+ one to be a list subclass (e.g. _ListWrapper used for
+ automatic dependency tracking.) */
+ && !(PyList_Check(o1) && PyList_Check(o2))
+ /* Two mapping types will also compare equal, making _DictWrapper
+ and dict compare equal. */
+ && !(IsMappingHelper(o1) && IsMappingHelper(o2))) {
*is_type_error = true;
*error_msg = tensorflow::strings::StrCat(
"The two namedtuples don't have the same sequence type. "
@@ -423,6 +536,24 @@ bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
return true;
}
}
+ } else if (IsMappingHelper(o1)) {
+ // Fallback for custom mapping types. Instead of using PyDict methods
+ // which stay in C, we call iter(o1).
+ if (PyMapping_Size(o1) != PyMapping_Size(o2)) {
+ SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+ return true;
+ }
+
+ Safe_PyObjectPtr iter(PyObject_GetIter(o1));
+ PyObject* key;
+ while ((key = PyIter_Next(iter.get())) != nullptr) {
+ if (!PyMapping_HasKey(o2, key)) {
+ SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+ Py_DECREF(key);
+ return true;
+ }
+ Py_DECREF(key);
+ }
}
}
@@ -470,6 +601,19 @@ void RegisterSequenceClass(PyObject* sequence_class) {
CollectionsSequenceType = sequence_class;
}
+void RegisterMappingClass(PyObject* mapping_class) {
+ if (!PyType_Check(mapping_class)) {
+ PyErr_SetString(
+ PyExc_TypeError,
+ tensorflow::strings::StrCat(
+ "Expecting a class definition for `collections.Mapping`. Got ",
+ Py_TYPE(mapping_class)->tp_name)
+ .c_str());
+ return;
+ }
+ CollectionsMappingType = mapping_class;
+}
+
void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class) {
if (!PyType_Check(sparse_tensor_value_class)) {
PyErr_SetString(
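
The CachedTypeCheck class introduced above amortizes expensive PyObject_IsInstance calls by memoizing the verdict per concrete type, and deliberately refuses to cache errors. A standalone model of the same scheme, with std::type_index standing in for PyTypeObject* and no CPython reference counting (all names here are hypothetical):

    #include <functional>
    #include <iostream>
    #include <mutex>
    #include <typeindex>
    #include <unordered_map>
    #include <utility>

    // Memoizes a ternary predicate (-1 error, 0 no, 1 yes) keyed by dynamic
    // type. Errors are not cached, matching the code above; the cache is
    // size-capped as a precaution against unbounded type churn.
    class CachedTypeCheck {
     public:
      explicit CachedTypeCheck(std::function<int(const std::type_index&)> pred)
          : pred_(std::move(pred)) {}

      int CachedLookup(const std::type_index& type) {
        {
          std::lock_guard<std::mutex> l(mu_);
          auto it = cache_.find(type);
          if (it != cache_.end()) return it->second;
        }
        // The potentially expensive predicate runs outside the lock, just as
        // PyObject_IsInstance may release the GIL in the real code.
        int result = pred_(type);
        if (result == -1) return -1;  // error: do not cache
        std::lock_guard<std::mutex> l(mu_);
        if (cache_.size() < 1024) cache_.emplace(type, result);
        return result;
      }

     private:
      std::function<int(const std::type_index&)> pred_;
      std::mutex mu_;
      std::unordered_map<std::type_index, int> cache_;
    };

    int main() {
      int predicate_calls = 0;
      CachedTypeCheck is_int([&](const std::type_index& t) {
        ++predicate_calls;
        return t == std::type_index(typeid(int)) ? 1 : 0;
      });
      std::cout << is_int.CachedLookup(typeid(int)) << '\n';  // 1, predicate runs
      std::cout << is_int.CachedLookup(typeid(int)) << '\n';  // 1, cache hit
      std::cout << predicate_calls << '\n';                   // 1
    }
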
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 70efc10c9a..41dcc969f8 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -118,7 +118,9 @@ PyObject* Flatten(PyObject* nested);
// the type from the module. This approach also requires some trigger from
// Python so that we know that the Python interpreter has been initialized.
void RegisterSequenceClass(PyObject* sequence_class);
-// Similar to the above function, except for the
+// Like RegisterSequenceClass, but for collections.Mapping.
+void RegisterMappingClass(PyObject* mapping_class);
+// Similar to the above functions, except for the
// sparse_tensor.SparseTensorValue class.
void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class);
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index 9f3b11b982..6ad1484295 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -31,6 +31,9 @@ limitations under the License.
%unignore tensorflow::swig::RegisterSequenceClass;
%noexception tensorflow::swig::RegisterSequenceClass;
+%unignore tensorflow::swig::RegisterMappingClass;
+%noexception tensorflow::swig::RegisterMappingClass;
+
%unignore tensorflow::swig::RegisterSparseTensorValueClass;
%noexception tensorflow::swig::RegisterSparseTensorValueClass;
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 766a0dafb5..1c3940e92c 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -322,6 +322,7 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
CudnnSupport::CudnnSupport(CUDAExecutor* parent) : parent_(parent) {}
port::Status CudnnSupport::Init() {
+ ScopedActivateExecutorContext context(parent_);
cudnnHandle_t cudnn_handle = nullptr;
auto status = cudnnCreate(&cudnn_handle);
if (status == CUDNN_STATUS_SUCCESS) {
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index d508f6594a..dbece3adf9 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -102,117 +102,16 @@ class CreatedContexts {
/* static */ int64 CreatedContexts::next_id_ = 1; // 0 means "no context"
// Formats CUresult to output prettified values into a log stream.
-// Error summaries taken from:
-// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9
-//
-// TODO(leary) switch to cuGetErrorName when updated cuda.h is available.
string ToString(CUresult result) {
-#define OSTREAM_CUDA_ERROR(__name) \
- case CUDA_ERROR_##__name: \
- return "CUDA_ERROR_" #__name;
-
-///////////////
-// NOTE: here we specify return code values outside of the enum explicitly
-// because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver
-// libraries are deployed in the fleet these error codes are backwards
-// compatible, but if we see a "new" one, we want to be able to identify it in
-// the logs.
-//
-// Once we get a cuda.h that has cuGetErrorName (TODO is above) we can
-// eliminate this function and just rely on the driver to provide us these
-// strings.
-//
-// NOTE: "Must reboot all context" below is shorthand for, "must
-// destroy/recreate the offending context and any allocation which come from
-// it if you are to continue using CUDA."
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch"
- switch (result) {
- OSTREAM_CUDA_ERROR(INVALID_VALUE)
- OSTREAM_CUDA_ERROR(OUT_OF_MEMORY)
- OSTREAM_CUDA_ERROR(NOT_INITIALIZED)
- OSTREAM_CUDA_ERROR(DEINITIALIZED)
- OSTREAM_CUDA_ERROR(NO_DEVICE)
- OSTREAM_CUDA_ERROR(INVALID_DEVICE)
- OSTREAM_CUDA_ERROR(INVALID_IMAGE)
- OSTREAM_CUDA_ERROR(INVALID_CONTEXT)
- OSTREAM_CUDA_ERROR(INVALID_HANDLE)
- OSTREAM_CUDA_ERROR(NOT_FOUND)
- OSTREAM_CUDA_ERROR(NOT_READY)
- OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU)
-
- // Encountered an uncorrectable ECC error during execution.
- OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE)
-
- // Load/store on an invalid address. Must reboot all context.
- case 700:
- return "CUDA_ERROR_ILLEGAL_ADDRESS";
- // Passed too many / wrong arguments, too many threads for register count.
- case 701:
- return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
- // Kernel took too long to execute.
- case 702:
- return "CUDA_ERROR_LAUNCH_TIMEOUT";
- // Kernel launch uses an incompatible texturing mode.
- case 703:
- return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
- // Trying to re-enable peer access that already has it enabled.
- case 704:
- return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
- // Trying to disable peer access that has not yet been enabled.
- case 705:
- return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
- // Primary context for the specified device has already been initialized.
- case 708:
- return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
- // Context current to calling thread has been destroyed or is a primary
- // context that has not yet been initialized.
- case 709:
- return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
- // Device-side assert triggered during kernel execution. Must reboot all
- // context.
- case 710:
- return "CUDA_ERROR_ASSERT";
- // Hardware resources to enable peer access have been exhausted.
- case 711:
- return "CUDA_ERROR_TOO_MANY_PEERS";
- // Memory range has already been registered.
- case 712:
- return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
- // Pointer does not correspond to any currently registered memory region.
- case 713:
- return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
- // Due to stack corruption or exceeding stack size limit. Must reboot all
- // context.
- case 714:
- return "CUDA_ERROR_HARDWARE_STACK_ERROR";
- case 715:
- return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
- // Load/store on an unaligned memory address. Must reboot all context.
- case 716:
- return "CUDA_ERROR_MISALIGNED_ADDRESS";
- // Device instruction with specific address space given address not
- // belonging to allowed address space. Must reboot all context.
- case 717:
- return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
- // Device program counter wrapped its address space. Must reboot all
- // context.
- case 718:
- return "CUDA_ERROR_INVALID_PC";
- // Exception on device while executing a kernel; e.g. deref invalid device
- // pointer, accessing OOB shared memory. Must reboot all context.
- case 719:
- return "CUDA_ERROR_LAUNCH_FAILED";
-
- OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE)
- OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED)
- OSTREAM_CUDA_ERROR(NOT_PERMITTED)
- OSTREAM_CUDA_ERROR(NOT_SUPPORTED)
- OSTREAM_CUDA_ERROR(UNKNOWN) // Unknown internal error to CUDA.
- default:
- return port::StrCat("CUresult(", static_cast<int>(result), ")");
+ const char *error_name;
+ if (cuGetErrorName(result, &error_name)) {
+ return port::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
+ }
+ const char *error_string;
+ if (cuGetErrorString(result, &error_string)) {
+ return error_name;
}
-#pragma GCC diagnostic pop
+ return port::StrCat(error_name, ": ", error_string);
}
// Returns the current context and checks that it is in the set of CUDA contexts
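
The rewritten ToString defers to the driver's own error tables via cuGetErrorName and cuGetErrorString (available since CUDA 6) instead of a hand-maintained switch, so codes from newer drivers still degrade gracefully. The same pattern as a free function, assuming the CUDA driver API is linked in (a sketch, not this file's exact code):

    #include <cuda.h>

    #include <string>

    // Formats a CUresult as "NAME: description". Falls back to the numeric
    // value when the driver does not recognize the code, and to the bare name
    // when only the description lookup fails.
    std::string CuResultToString(CUresult result) {
      const char* name = nullptr;
      if (cuGetErrorName(result, &name) != CUDA_SUCCESS) {
        return "UNKNOWN ERROR (" + std::to_string(static_cast<int>(result)) + ")";
      }
      const char* description = nullptr;
      if (cuGetErrorString(result, &description) != CUDA_SUCCESS) {
        return name;
      }
      return std::string(name) + ": " + description;
    }
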
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 259c813c57..73f05b94db 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -206,6 +206,48 @@ static string GetBinaryDir(bool strip_exe) {
return exe_path;
}
+bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
+ uint64_t module_refcount;
+ std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
+
+ if (*module == nullptr) {
+ auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
+ if (!load_status.ok()) {
+ LOG(ERROR) << "failed to load CUBIN: " << load_status;
+ return false;
+ }
+ module_refcount = 1;
+ VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
+ << " as module " << *module;
+ } else {
+ ++module_refcount;
+ VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
+ << " is already loaded as module " << *module;
+ }
+ gpu_binary_to_module_[cubin] = {*module, module_refcount};
+ return true;
+}
+
+bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
+ uint64_t module_refcount;
+ std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
+
+ if (*module == nullptr) {
+ if (!CUDADriver::LoadPtx(context_, ptx, module)) {
+ return false;
+ }
+ VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
+ << *module;
+ module_refcount = 1;
+ } else {
+ ++module_refcount;
+ VLOG(3) << "PTX " << static_cast<const void *>(ptx)
+ << " is already loaded as module " << module;
+ }
+ gpu_binary_to_module_[ptx] = {*module, module_refcount};
+ return true;
+}
+
bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) {
CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
@@ -215,28 +257,13 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
if (spec.has_cuda_cubin_in_memory()) {
+ mutex_lock lock{in_memory_modules_mu_};
kernelname = &spec.cuda_cubin_in_memory().kernelname();
const char *cubin = spec.cuda_cubin_in_memory().bytes();
- mutex_lock lock{in_memory_modules_mu_};
- uint64_t module_refcount;
- std::tie(module, module_refcount) = gpu_binary_to_module_[cubin];
-
- if (module == nullptr) {
- auto load_status = CUDADriver::LoadCubin(context_, cubin, &module);
- if (!load_status.ok()) {
- LOG(ERROR) << "failed to load CUBIN: " << load_status;
- return false;
- }
- module_refcount = 1;
- VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
- << " as module " << module;
- } else {
- ++module_refcount;
- VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
- << " is already loaded as module " << module;
+ if (!LoadModuleFromCuBin(cubin, &module)) {
+ return false;
}
kernel_to_gpu_binary_[kernel] = cubin;
- gpu_binary_to_module_[cubin] = {module, module_refcount};
} else if (spec.has_cuda_ptx_in_memory()) {
kernelname = &spec.cuda_ptx_in_memory().kernelname();
@@ -254,24 +281,10 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
}
mutex_lock lock{in_memory_modules_mu_};
- uint64_t module_refcount;
- std::tie(module, module_refcount) = gpu_binary_to_module_[ptx];
-
- if (module == nullptr) {
- if (!CUDADriver::LoadPtx(context_, ptx, &module)) {
- LOG(ERROR) << "failed to load PTX for kernel " << *kernelname;
- return false;
- }
- VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx)
- << " as module " << module;
- module_refcount = 1;
- } else {
- ++module_refcount;
- VLOG(3) << "PTX " << static_cast<const void *>(ptx)
- << " is already loaded as module " << module;
+ if (!LoadModuleFromPtx(ptx, &module)) {
+ return false;
}
kernel_to_gpu_binary_[kernel] = ptx;
- gpu_binary_to_module_[ptx] = {module, module_refcount};
} else {
LOG(WARNING) << "no method of loading CUDA kernel provided";
return false;
@@ -295,6 +308,23 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
return true;
}
+bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
+ auto module_it = gpu_binary_to_module_.find(gpu_binary);
+ if (gpu_binary_to_module_.end() == module_it) {
+ VLOG(3) << "No loaded CUDA module for " << gpu_binary;
+ return false;
+ }
+ auto &module = module_it->second.first;
+ auto &refcount = module_it->second.second;
+ VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
+ if (--refcount == 0) {
+ VLOG(3) << "Unloading CUDA module " << module;
+ CUDADriver::UnloadModule(context_, module);
+ gpu_binary_to_module_.erase(module_it);
+ }
+ return true;
+}
+
void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
@@ -307,25 +337,52 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
}
VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
<< " has loaded GPU code " << gpu_binary_it->second;
- auto module_it = gpu_binary_to_module_.find(gpu_binary_it->second);
- if (gpu_binary_to_module_.end() == module_it) {
- VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
- << " has no loaded CUDA module.";
- return; // This kernel never loaded any modules
- }
- auto &module = module_it->second.first;
- auto &refcount = module_it->second.second;
- VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
- << " has loaded GPU code " << gpu_binary_it->second
- << " into CUDA module " << module << " with refcount " << refcount;
- if (--refcount == 0) {
- VLOG(3) << "Unloading CUDA module " << module;
- CUDADriver::UnloadModule(context_, module);
- gpu_binary_to_module_.erase(module_it);
- }
+ UnloadGpuBinary(gpu_binary_it->second);
kernel_to_gpu_binary_.erase(gpu_binary_it);
}
+bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle) {
+ // In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
+ // ModuleHandle::id().
+ CUmodule cu_module;
+ if (spec.has_cuda_cubin_in_memory()) {
+ mutex_lock lock{in_memory_modules_mu_};
+ if (!LoadModuleFromCuBin(
+ reinterpret_cast<const char *>(spec.cuda_cubin_in_memory().data()),
+ &cu_module)) {
+ return false;
+ }
+ *module_handle = ModuleHandle(const_cast<void *>(
+ static_cast<const void *>(spec.cuda_cubin_in_memory().data())));
+ return true;
+ } else if (spec.has_cuda_ptx_in_memory()) {
+ if (cc_major_ == 0 && cc_minor_ == 0) {
+ return false;
+ }
+
+ if (!spec.cuda_ptx_in_memory()) {
+ return false;
+ }
+
+ mutex_lock lock{in_memory_modules_mu_};
+ if (!LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module)) {
+ return false;
+ }
+ *module_handle = ModuleHandle(const_cast<void *>(
+ static_cast<const void *>(spec.cuda_ptx_in_memory())));
+ return true;
+ }
+ LOG(WARNING) << "no method of loading CUDA module provided";
+ return false;
+}
+
+bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
+ const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
+ mutex_lock lock{in_memory_modules_mu_};
+ return UnloadGpuBinary(gpu_binary);
+}
+
bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
KernelMetadata *kernel_metadata) {
int value;
@@ -783,16 +840,26 @@ bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
}
-bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem,
+bool CUDAExecutor::GetSymbol(const string &symbol_name,
+ ModuleHandle module_handle, void **mem,
size_t *bytes) {
+ auto lookup_in_module = [&](CUmodule module) {
+ CHECK(module != nullptr);
+ return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
+ reinterpret_cast<CUdeviceptr *>(mem),
+ bytes);
+ };
+
{ // give limited scope to mutex_lock
mutex_lock lock{in_memory_modules_mu_};
+ if (static_cast<bool>(module_handle)) {
+ auto it = gpu_binary_to_module_.find(module_handle.id());
+ CHECK(it != gpu_binary_to_module_.end());
+ return lookup_in_module(it->second.first);
+ }
+
for (auto &it : gpu_binary_to_module_) {
- CUmodule module = it.second.first;
- CHECK(module != nullptr);
- if (CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
- reinterpret_cast<CUdeviceptr *>(mem),
- bytes)) {
+ if (lookup_in_module(it.second.first)) {
return true;
}
}
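
LoadModuleFromCuBin, LoadModuleFromPtx, and UnloadGpuBinary together maintain a reference-counted cache keyed by the GPU binary pointer: the first load creates the driver module, repeated loads only bump a count, and the module is unloaded when the count drops back to zero. A simplified, driver-free model of that bookkeeping (hypothetical names; a std::string stands in for the CUmodule):

    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <utility>

    class ModuleCache {
     public:
      // Returns the (possibly cached) module for `binary`, bumping its refcount.
      const std::string& Load(const void* binary) {
        auto& entry = cache_[binary];
        if (entry.second == 0) {
          // First load: "create" the module, as CUDADriver::LoadCubin/LoadPtx do.
          entry.first =
              "module@" + std::to_string(reinterpret_cast<std::uintptr_t>(binary));
        }
        ++entry.second;
        return entry.first;
      }

      // Drops one reference; the module is destroyed when the count hits zero.
      bool Unload(const void* binary) {
        auto it = cache_.find(binary);
        if (it == cache_.end()) return false;  // nothing loaded for this binary
        if (--it->second.second == 0) cache_.erase(it);
        return true;
      }

     private:
      // binary pointer -> {module, refcount}, like gpu_binary_to_module_ above.
      std::unordered_map<const void*, std::pair<std::string, std::uint64_t>> cache_;
    };

    int main() {
      static const char kPtx[] = ".version 6.0 ...";  // placeholder PTX text
      ModuleCache cache;
      cache.Load(kPtx);
      cache.Load(kPtx);             // second load reuses the cached module
      assert(cache.Unload(kPtx));   // refcount 2 -> 1, module stays loaded
      assert(cache.Unload(kPtx));   // refcount 1 -> 0, module unloaded
      assert(!cache.Unload(kPtx));  // already gone
    }
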
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index f7c341c857..8a954d5461 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -62,6 +62,9 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
bool GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) override;
void UnloadKernel(const KernelBase *kernel) override;
+ bool LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle) override;
+ bool UnloadModule(ModuleHandle module_handle) override;
bool Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &k,
@@ -175,7 +178,8 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
// Search for the symbol and returns a device pointer and size.
// Returns false if symbol does not exist.
- bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override;
+ bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+ void **mem, size_t *bytes) override;
DeviceDescription *PopulateDeviceDescription() const override;
@@ -239,6 +243,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
const BlockDim &block_dims);
+ bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
+ EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+ // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
+ bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
+ EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+ bool UnloadGpuBinary(const void *gpu_binary)
+ EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
// Guards the in-memory-module mapping.
mutex in_memory_modules_mu_;
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 3cd97b3cf1..8adf739b17 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -93,7 +93,7 @@ bool HostExecutor::MemcpyDeviceToDevice(Stream *stream,
// the nature of the HostExecutor) memcpy on the stream (HostStream)
// associated with the HostExecutor.
AsHostStream(stream)->EnqueueTask(
- [src_mem, dst_mem, size]() { memcpy(src_mem, dst_mem, size); });
+ [src_mem, dst_mem, size]() { memcpy(dst_mem, src_mem, size); });
return true;
}
diff --git a/tensorflow/stream_executor/module_spec.h b/tensorflow/stream_executor/module_spec.h
new file mode 100644
index 0000000000..75bdfed2d7
--- /dev/null
+++ b/tensorflow/stream_executor/module_spec.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
+#define TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
+
+#include "tensorflow/stream_executor/lib/array_slice.h"
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+// Describes how to load a module on a target platform.
+//
+// The exact meaning of a "module" may differ from platform to platform but
+// loosely speaking a module is a collection of kernels and global variables. It
+// corresponds to CUmodule when running on CUDA.
+class MultiModuleLoaderSpec {
+ public:
+ bool has_cuda_cubin_in_memory() const { return has_cuda_cubin_in_memory_; }
+ port::ArraySlice<const uint8> cuda_cubin_in_memory() const {
+ CHECK(has_cuda_cubin_in_memory());
+ return {cuda_cubin_in_memory_.data(), cuda_cubin_in_memory_.size()};
+ }
+
+ bool has_cuda_ptx_in_memory() const { return has_cuda_ptx_in_memory_; }
+ const char* cuda_ptx_in_memory() const {
+ CHECK(has_cuda_ptx_in_memory());
+ return cuda_ptx_in_memory_;
+ }
+
+ void AddCudaCubinInMemory(port::ArraySlice<const uint8> cubin_bytes) {
+ CHECK(!cubin_bytes.empty());
+ has_cuda_cubin_in_memory_ = true;
+ cuda_cubin_in_memory_ = cubin_bytes;
+ }
+
+ void AddCudaPtxInMemory(const char* ptx) {
+ has_cuda_ptx_in_memory_ = true;
+ // The CUDA driver does not like getting an empty string as PTX.
+ cuda_ptx_in_memory_ = *ptx ? ptx : nullptr;
+ }
+
+ private:
+ port::ArraySlice<const uint8> cuda_cubin_in_memory_;
+ bool has_cuda_cubin_in_memory_ = false;
+ const char* cuda_ptx_in_memory_;
+ bool has_cuda_ptx_in_memory_ = false;
+};
+
+} // namespace stream_executor
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
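
For orientation, this is roughly how a MultiModuleLoaderSpec is filled in and handed off; LoadModule is the StreamExecutor entry point added later in this change, and the helper below is a hypothetical caller, not part of the commit:

    #include "tensorflow/stream_executor/module_spec.h"
    #include "tensorflow/stream_executor/stream_executor_pimpl.h"

    // Loads a null-terminated PTX string, returning an invalid (null) handle
    // on failure.
    stream_executor::ModuleHandle LoadPtxModule(
        stream_executor::StreamExecutor* executor, const char* ptx) {
      stream_executor::MultiModuleLoaderSpec spec;
      spec.AddCudaPtxInMemory(ptx);  // the spec nulls out empty PTX for us
      stream_executor::ModuleHandle handle;
      if (!executor->LoadModule(spec, &handle)) {
        return stream_executor::ModuleHandle();  // id() == nullptr: invalid
      }
      return handle;
    }
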
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index ca1b8e28e6..b0c061fd74 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -267,6 +267,12 @@ Stream::Stream(StreamExecutor *parent,
Stream::~Stream() {
VLOG_CALL();
+ // Ensure the stream is completed.
+ auto status = BlockHostUntilDone();
+ if (!status.ok()) {
+ LOG(WARNING) << "Error blocking host until done in stream destructor: "
+ << status;
+ }
temporary_memory_manager_.ForceDeallocateAll();
if (allocated_) {
@@ -1935,7 +1941,14 @@ void Stream::ReturnSubStream(Stream *sub_stream) {
mutex_lock lock(mu_);
for (auto &stream : sub_streams_) {
if (stream.first.get() == sub_stream) {
- stream.second = true;
+ // Streams have a monotonic state machine; if a stream
+ // encounters an error, it will remain in an error state
+ // forever. Only allow re-use of ok streams.
+ //
+ // TODO(toddw): Improve this mechanism, if necessary, to drop
+ // failed streams completely.
+ const bool ready_to_reuse = sub_stream->ok();
+ stream.second = ready_to_reuse;
return;
}
}
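
Since streams are monotonic, ReturnSubStream now records a sub-stream as reusable only while it is still ok(); an errored stream stays owned by the parent but is never handed out again. A toy model of that reuse predicate (a sketch, not the Stream class itself):

    #include <memory>
    #include <utility>
    #include <vector>

    struct FakeStream {
      bool ok = true;  // monotonic: once an error occurs, ok stays false
    };

    class SubStreamPool {
     public:
      // Hands back a reusable sub-stream if one exists, else creates one.
      FakeStream* GetOrCreate() {
        for (auto& entry : streams_) {
          if (entry.second) {
            entry.second = false;  // now in use
            return entry.first.get();
          }
        }
        streams_.emplace_back(std::make_unique<FakeStream>(), false);
        return streams_.back().first.get();
      }

      // Marks the stream reusable only if it never entered an error state.
      void Return(FakeStream* s) {
        for (auto& entry : streams_) {
          if (entry.first.get() == s) entry.second = s->ok;
        }
      }

     private:
      std::vector<std::pair<std::unique_ptr<FakeStream>, bool>> streams_;
    };

    int main() {
      SubStreamPool pool;
      FakeStream* s1 = pool.GetOrCreate();
      s1->ok = false;   // force an error, as stream_test.cc below does
      pool.Return(s1);  // not marked reusable
      FakeStream* s2 = pool.GetOrCreate();  // fresh stream, not s1
      return s2->ok ? 0 : 1;
    }
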
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 63d64947c8..706442a666 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -125,7 +125,7 @@ class Stream {
Stream *GetOrCreateSubStream() LOCKS_EXCLUDED(mu_);
// Return the sub-stream back to the host stream so that it can be reused
- // later.
+ // later. Sub-streams that are !ok() will not be reused.
void ReturnSubStream(Stream *sub_stream) LOCKS_EXCLUDED(mu_);
// Allocate temporary memories. The stream will deallocate them when blocked
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index fb1b92cb84..f34b1fc083 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -36,20 +36,38 @@ limitations under the License.
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/kernel_spec.h"
#include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/module_spec.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/shared_memory_config.h"
#include "tensorflow/stream_executor/trace_listener.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
namespace stream_executor {
class Stream;
class Timer;
+// An opaque handle to a loaded module.
+//
+// An instance of this is produced by StreamExecutor::LoadModule.
+class ModuleHandle {
+ public:
+ /*implicit*/ ModuleHandle(void *id = nullptr) : id_(id) {}
+
+ // A ModuleHandle with id() == nullptr is an invalid module handle, akin to a
+ // null pointer.
+ void *id() const { return id_; }
+
+ explicit operator bool() const { return id() != nullptr; }
+
+ private:
+ void *id_;
+};
+
namespace internal {
// Platform-dependent interface class for the generic Events interface, in
@@ -164,6 +182,11 @@ class StreamExecutorInterface {
KernelBase *kernel) {
return false;
}
+ virtual bool LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle) {
+ return false;
+ }
+ virtual bool UnloadModule(ModuleHandle module_handle) { return false; }
virtual bool Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) {
@@ -247,7 +270,12 @@ class StreamExecutorInterface {
// null, however, both of them cannot be null at the same time. To use
// constant memory in CUDA, GetSymbol has to be used. Returns true if symbol
// is found.
- virtual bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) {
+ //
+ // If ModuleHandle is set then we search for `symbol_name` only within the
+ // module corresponding to `module_handle`. Otherwise all loaded modules are
+ // searched.
+ virtual bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+ void **mem, size_t *bytes) {
return false;
}
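
ModuleHandle behaves like a nullable opaque pointer: a default-constructed handle means "no particular module", and its explicit bool conversion is what gates whether GetSymbol searches one module or all of them. A few lines showing the intended semantics (the class is restated here so the snippet compiles on its own):

    #include <cassert>

    // Restatement of the ModuleHandle declared above, for a self-contained demo.
    class ModuleHandle {
     public:
      /*implicit*/ ModuleHandle(void* id = nullptr) : id_(id) {}
      void* id() const { return id_; }
      explicit operator bool() const { return id() != nullptr; }

     private:
      void* id_;
    };

    int main() {
      ModuleHandle invalid;                 // id() == nullptr
      assert(!static_cast<bool>(invalid));  // GetSymbol searches all modules

      int gpu_binary;  // stand-in for a PTX/CUBIN pointer used as the id
      ModuleHandle scoped(&gpu_binary);
      assert(static_cast<bool>(scoped));    // GetSymbol searches only this one
    }
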
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 000795ff00..2e0137a485 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -220,6 +220,15 @@ void StreamExecutor::UnloadKernel(const KernelBase *kernel) {
implementation_->UnloadKernel(kernel);
}
+bool StreamExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle) {
+ return implementation_->LoadModule(spec, module_handle);
+}
+
+bool StreamExecutor::UnloadModule(ModuleHandle module_handle) {
+ return implementation_->UnloadModule(module_handle);
+}
+
void StreamExecutor::Deallocate(DeviceMemoryBase *mem) {
VLOG(1) << "Called StreamExecutor::Deallocate(mem=" << mem->opaque()
<< ") mem->size()=" << mem->size() << StackTraceIfVLOG10();
@@ -459,9 +468,34 @@ void *StreamExecutor::Allocate(uint64 size) {
return buf;
}
-bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem,
+port::StatusOr<DeviceMemoryBase> StreamExecutor::GetUntypedSymbol(
+ const string &symbol_name, ModuleHandle module_handle) {
+ // If failed to get the symbol, opaque/bytes are unchanged. Initialize them to
+ // be nullptr/0 for consistency with DeviceMemory semantics.
+ void *opaque = nullptr;
+ size_t bytes = 0;
+ if (GetSymbol(symbol_name, module_handle, &opaque, &bytes)) {
+ return DeviceMemoryBase(opaque, bytes);
+ }
+
+ if (static_cast<bool>(module_handle)) {
+ return port::Status(
+ port::error::NOT_FOUND,
+ port::StrCat("Check if module containing symbol ", symbol_name,
+ " is loaded (module_handle = ",
+ reinterpret_cast<uintptr_t>(module_handle.id()), ")"));
+ } else {
+ return port::Status(
+ port::error::NOT_FOUND,
+ port::StrCat("Check if kernel using the symbol is loaded: ",
+ symbol_name));
+ }
+}
+
+bool StreamExecutor::GetSymbol(const string &symbol_name,
+ ModuleHandle module_handle, void **mem,
size_t *bytes) {
- return implementation_->GetSymbol(symbol_name, mem, bytes);
+ return implementation_->GetSymbol(symbol_name, module_handle, mem, bytes);
}
void *StreamExecutor::UnifiedMemoryAllocate(uint64 bytes) {
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index ad80a1ba25..47b3a2b030 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -106,6 +106,16 @@ class StreamExecutor {
// Releases any state associated with the previously loaded kernel.
void UnloadKernel(const KernelBase *kernel);
+ // Loads a module for the platform this StreamExecutor is acting upon.
+ //
+ // `spec` describes the module to be loaded. On success writes the handle for
+ // the loaded module to `module_handle` and returns true. Else returns false.
+ bool LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle);
+
+ // Unloads the module with handle `module_handle`.
+ bool UnloadModule(ModuleHandle module_handle);
+
// Synchronously allocates an array on the device of type T with element_count
// elements.
template <typename T>
@@ -169,8 +179,16 @@ class StreamExecutor {
// type of symbol and T match.
// - Note: symbol_name should include its namespace as well. For example,
// pass "nms0::symbol" if referring to nms0::symbol.
+ //
+ // If `module_handle` is set then searches only within the module
+ // corresponding to `module_handle`.
template <typename T>
- port::StatusOr<DeviceMemory<T>> GetSymbol(const string &symbol_name);
+ port::StatusOr<DeviceMemory<T>> GetSymbol(const string &symbol_name,
+ ModuleHandle module_handle = {});
+
+ // An untyped version of GetSymbol.
+ port::StatusOr<DeviceMemoryBase> GetUntypedSymbol(
+ const string &symbol_name, ModuleHandle module_handle = {});
// Deallocate the DeviceMemory previously allocated via this interface.
// Deallocation of a nullptr-representative value is permitted.
@@ -507,7 +525,8 @@ class StreamExecutor {
// Finds and retrieves device memory for the symbol on the underlying
// platform.
- bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes);
+ bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+ void **mem, size_t *bytes);
// Entrains a memcpy operation onto stream, with a host destination location
// host_dst and a device memory source, with target size size.
@@ -678,6 +697,41 @@ class StreamExecutor {
SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor);
};
+// A wrapper around ModuleHandle that uses RAII to manage its lifetime.
+class ScopedModuleHandle {
+ public:
+ explicit ScopedModuleHandle(StreamExecutor *executor,
+ ModuleHandle module_handle)
+ : executor_(executor), module_handle_(module_handle) {}
+
+ ScopedModuleHandle(ScopedModuleHandle &&other) {
+ executor_ = other.executor_;
+ module_handle_ = other.module_handle_;
+ other.executor_ = nullptr;
+ other.module_handle_ = ModuleHandle();
+ }
+
+ ScopedModuleHandle &operator=(ScopedModuleHandle &&other) {
+ executor_ = other.executor_;
+ module_handle_ = other.module_handle_;
+ other.executor_ = nullptr;
+ other.module_handle_ = ModuleHandle();
+ return *this;
+ }
+
+ ~ScopedModuleHandle() {
+ if (static_cast<bool>(module_handle_)) {
+ CHECK(executor_->UnloadModule(module_handle_));
+ }
+ }
+
+ private:
+ StreamExecutor *executor_;
+ ModuleHandle module_handle_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedModuleHandle);
+};
+
////////////
// Inlines
@@ -690,19 +744,13 @@ inline DeviceMemory<T> StreamExecutor::AllocateArray(uint64 element_count) {
template <typename T>
inline port::StatusOr<DeviceMemory<T>> StreamExecutor::GetSymbol(
- const string &symbol_name) {
- // If failed to get the symbol, opaque/bytes are unchanged. Initialize them to
- // be nullptr/0 for consistency with DeviceMemory semantics.
- void *opaque = nullptr;
- size_t bytes = 0;
- if (GetSymbol(symbol_name, &opaque, &bytes)) {
- CHECK_EQ(bytes % sizeof(T), 0);
- return DeviceMemory<T>::MakeFromByteSize(opaque, bytes);
+ const string &symbol_name, ModuleHandle module_handle) {
+ port::StatusOr<DeviceMemoryBase> untyped_symbol =
+ GetUntypedSymbol(symbol_name, module_handle);
+ if (!untyped_symbol.ok()) {
+ return untyped_symbol.status();
}
- return port::Status(
- port::error::NOT_FOUND,
- port::StrCat("Check if kernel using the symbol is loaded: ",
- symbol_name));
+ return DeviceMemory<T>(untyped_symbol.ValueOrDie());
}
template <typename ElemT>
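
ScopedModuleHandle ties UnloadModule to scope exit so early returns cannot leak a module refcount. A hedged usage sketch; `executor`, `spec`, and the symbol name are placeholders rather than code from this commit:

    #include "tensorflow/stream_executor/module_spec.h"
    #include "tensorflow/stream_executor/stream_executor_pimpl.h"

    // Loads a module, reads one float symbol out of it, and unloads it again.
    bool ReadSymbol(stream_executor::StreamExecutor* executor,
                    const stream_executor::MultiModuleLoaderSpec& spec) {
      stream_executor::ModuleHandle handle;
      if (!executor->LoadModule(spec, &handle)) return false;
      stream_executor::ScopedModuleHandle scoped(executor, handle);

      // Passing `handle` restricts the lookup to the module just loaded.
      auto symbol = executor->GetSymbol<float>("nms0::symbol", handle);
      if (!symbol.ok()) return false;  // `scoped` still unloads the module
      // ... use symbol.ValueOrDie() ...
      return true;
    }  // `scoped` unloads the module here on the success path too
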
diff --git a/tensorflow/stream_executor/stream_test.cc b/tensorflow/stream_executor/stream_test.cc
new file mode 100644
index 0000000000..47dd675834
--- /dev/null
+++ b/tensorflow/stream_executor/stream_test.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/stream_executor.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace stream_executor {
+namespace {
+
+class StreamTest : public ::testing::Test {
+ protected:
+ std::unique_ptr<StreamExecutor> NewStreamExecutor() {
+ Platform* platform =
+ MultiPlatformManager::PlatformWithName("Host").ConsumeValueOrDie();
+ StreamExecutorConfig config(/*ordinal=*/0);
+ return platform->GetUncachedExecutor(config).ConsumeValueOrDie();
+ }
+};
+
+TEST_F(StreamTest, NoInitNotOk) {
+ std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+ Stream stream(executor.get());
+ EXPECT_FALSE(stream.ok());
+}
+
+TEST_F(StreamTest, InitOk) {
+ std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+ Stream stream(executor.get());
+ stream.Init();
+ EXPECT_TRUE(stream.ok());
+}
+
+TEST_F(StreamTest, OneSubStream) {
+ std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+ Stream stream(executor.get());
+ stream.Init();
+ EXPECT_TRUE(stream.ok());
+
+ // Get and return a sub-stream. Sub-streams are always initialized.
+ Stream* sub_stream1 = stream.GetOrCreateSubStream();
+ EXPECT_TRUE(sub_stream1->ok());
+ stream.ReturnSubStream(sub_stream1);
+
+ // Get and return another sub-stream.
+ Stream* sub_stream2 = stream.GetOrCreateSubStream();
+ EXPECT_TRUE(sub_stream2->ok());
+  stream.ReturnSubStream(sub_stream2);
+
+ // The underlying sub-streams should be the same, since sub_stream1
+ // was returned before we tried to get sub_stream2.
+ EXPECT_EQ(sub_stream1, sub_stream2);
+}
+
+TEST_F(StreamTest, TwoSubStreams) {
+ std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+ Stream stream(executor.get());
+ stream.Init();
+ EXPECT_TRUE(stream.ok());
+
+ // Get two sub-streams.
+ Stream* sub_stream1 = stream.GetOrCreateSubStream();
+ EXPECT_TRUE(sub_stream1->ok());
+ Stream* sub_stream2 = stream.GetOrCreateSubStream();
+ EXPECT_TRUE(sub_stream2->ok());
+
+ // The underlying sub-streams should be different, since neither
+ // sub-stream has been returned.
+ EXPECT_NE(sub_stream1, sub_stream2);
+
+ // Return sub_stream1 and get sub_stream3, which should be the same.
+ stream.ReturnSubStream(sub_stream1);
+ Stream* sub_stream3 = stream.GetOrCreateSubStream();
+ EXPECT_TRUE(sub_stream3->ok());
+ EXPECT_EQ(sub_stream1, sub_stream3);
+ EXPECT_NE(sub_stream2, sub_stream3);
+
+ // Return sub_stream2 and get sub_stream4, which should be the same.
+ stream.ReturnSubStream(sub_stream2);
+ Stream* sub_stream4 = stream.GetOrCreateSubStream();
+ EXPECT_TRUE(sub_stream4->ok());
+ EXPECT_EQ(sub_stream2, sub_stream4);
+ EXPECT_NE(sub_stream3, sub_stream4);
+}
+
+TEST_F(StreamTest, FailedSubStreamNotReused) {
+ std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+ Stream stream(executor.get());
+ stream.Init();
+ EXPECT_TRUE(stream.ok());
+
+ // Get a sub-stream.
+ Stream* sub_stream1 = stream.GetOrCreateSubStream();
+ EXPECT_TRUE(sub_stream1->ok());
+
+ // Force an error on the stream; here we call a method that requires
+ // DNN support, which we know the Host platform doesn't support.
+ sub_stream1->ThenDepthConcatenate({}, {}, nullptr);
+ EXPECT_FALSE(sub_stream1->ok());
+
+ // Return sub_stream1 and get sub_stream2.
+ stream.ReturnSubStream(sub_stream1);
+ Stream* sub_stream2 = stream.GetOrCreateSubStream();
+ EXPECT_TRUE(sub_stream2->ok());
+
+ // The underlying streams should be different. They would have been
+ // the same, but since we forced an error on sub_stream1, it will
+ // not be re-used. Sadly we can't just check:
+ // EXPECT_NE(sub_stream1, sub_stream2);
+ //
+ // The above should hold logically, but it may fail if the new
+ // stream instance allocated for sub_stream2 happens to reside in
+ // the same memory address as sub_stream1.
+ //
+  // Checking that sub_stream2->ok() above serves as a good-enough substitute.
+
+ // Return sub_stream2 and get sub_stream3. The previous error on
+ // sub_stream1 has no effect on these streams, and they are the
+ // same.
+ stream.ReturnSubStream(sub_stream2);
+ Stream* sub_stream3 = stream.GetOrCreateSubStream();
+ EXPECT_TRUE(sub_stream3->ok());
+ EXPECT_EQ(sub_stream2, sub_stream3);
+}
+
+} // namespace
+} // namespace stream_executor
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 955b53f691..58282ec1c7 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -9,6 +9,7 @@ load(
"tf_additional_grpc_deps_py",
"tf_additional_xla_deps_py",
"if_static",
+ "if_dynamic_kernels",
)
load(
"@local_config_tensorrt//:build_defs.bzl",
@@ -137,6 +138,14 @@ def if_not_mobile(a):
"//conditions:default": a,
})
+# Config setting selector used when building for products
+# which require restricted licenses to be avoided.
+def if_not_lgpl_restricted(a):
+ _ = (a,)
+ return select({
+ "//conditions:default": [],
+ })
+
def if_not_windows(a):
return select({
clean_dep("//tensorflow:windows"): [],
@@ -236,6 +245,7 @@ def tf_copts(android_optimization_level_override="-O2", is_external=False):
clean_dep("//tensorflow:windows"): get_win_copts(is_external),
clean_dep("//tensorflow:windows_msvc"): get_win_copts(is_external),
clean_dep("//tensorflow:ios"): ["-std=c++11"],
+ clean_dep("//tensorflow:no_lgpl_deps"): ["-D__TENSORFLOW_NO_LGPL_DEPS__", "-pthread"],
"//conditions:default": ["-pthread"]
}))
@@ -309,18 +319,36 @@ def tf_binary_additional_srcs():
clean_dep("//tensorflow:libtensorflow_framework.so"),
])
+
+# Helper function to add kernel dependencies to tf binaries when using
+# dynamic kernel linking.
+def tf_binary_dynamic_kernel_dsos(kernels):
+ return if_dynamic_kernels(
+ extra_deps=["libtfkernel_%s.so" % clean_dep(k) for k in kernels],
+ otherwise=[])
+
+# Helper function to add kernel dependencies to tf binaries when using
+# static kernel linking.
+def tf_binary_dynamic_kernel_deps(kernels):
+ return if_dynamic_kernels(
+ extra_deps=[],
+ otherwise=kernels)
+
def tf_cc_shared_object(
name,
srcs=[],
deps=[],
+ data=[],
linkopts=[],
framework_so=tf_binary_additional_srcs(),
+ kernels=[],
**kwargs):
native.cc_binary(
name=name,
srcs=srcs + framework_so,
- deps=deps,
+ deps=deps + tf_binary_dynamic_kernel_deps(kernels),
linkshared = 1,
+ data = data + tf_binary_dynamic_kernel_dsos(kernels),
linkopts=linkopts + _rpath_linkopts(name) + select({
clean_dep("//tensorflow:darwin"): [
"-Wl,-install_name,@rpath/" + name.split("/")[-1],
@@ -344,18 +372,21 @@ register_extension_info(
def tf_cc_binary(name,
srcs=[],
deps=[],
+ data=[],
linkopts=[],
copts=tf_copts(),
+ kernels=[],
**kwargs):
native.cc_binary(
name=name,
copts=copts,
srcs=srcs + tf_binary_additional_srcs(),
- deps=deps + if_mkl(
+ deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
[
"//third_party/mkl:intel_binary_blob",
],
),
+ data=data + tf_binary_dynamic_kernel_dsos(kernels),
linkopts=linkopts + _rpath_linkopts(name),
**kwargs)
@@ -540,9 +571,6 @@ def tf_gen_op_wrappers_cc(name,
# is invalid to specify both "hidden" and "op_whitelist".
# cc_linkopts: Optional linkopts to be added to tf_cc_binary that contains the
# specified ops.
-# gen_locally: if True, the genrule to generate the Python library will be run
-# without sandboxing. This would help when the genrule depends on symlinks
-# which may not be supported in the sandbox.
def tf_gen_op_wrapper_py(name,
out=None,
hidden=None,
@@ -553,8 +581,7 @@ def tf_gen_op_wrapper_py(name,
generated_target_name=None,
op_whitelist=[],
cc_linkopts=[],
- api_def_srcs=[],
- gen_locally=False):
+ api_def_srcs=[]):
if (hidden or hidden_file) and op_whitelist:
fail('Cannot pass specify both hidden and op_whitelist.')
@@ -609,7 +636,6 @@ def tf_gen_op_wrapper_py(name,
outs=[out],
srcs=api_def_srcs + [hidden_file],
tools=[tool_name] + tf_binary_additional_srcs(),
- local = (1 if gen_locally else 0),
cmd=("$(location " + tool_name + ") " + api_def_args_str +
" @$(location " + hidden_file + ") " +
("1" if require_shape_functions else "0") + " > $@"))
@@ -619,7 +645,6 @@ def tf_gen_op_wrapper_py(name,
outs=[out],
srcs=api_def_srcs,
tools=[tool_name] + tf_binary_additional_srcs(),
- local = (1 if gen_locally else 0),
cmd=("$(location " + tool_name + ") " + api_def_args_str + " " +
op_list_arg + " " +
("1" if require_shape_functions else "0") + " " +
@@ -649,11 +674,13 @@ def tf_gen_op_wrapper_py(name,
def tf_cc_test(name,
srcs,
deps,
+ data=[],
linkstatic=0,
extra_copts=[],
suffix="",
linkopts=[],
nocopts=None,
+ kernels=[],
**kwargs):
native.cc_test(
name="%s%s" % (name, suffix),
@@ -673,11 +700,12 @@ def tf_cc_test(name,
"-lm"
],
}) + linkopts + _rpath_linkopts(name),
- deps=deps + if_mkl(
+ deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
[
"//third_party/mkl:intel_binary_blob",
],
),
+ data=data + tf_binary_dynamic_kernel_dsos(kernels),
# Nested select() statements seem not to be supported when passed to
# linkstatic, and we already have a cuda select() passed in to this
# function.
@@ -778,6 +806,7 @@ def tf_cuda_only_cc_test(name,
size="medium",
linkstatic=0,
args=[],
+ kernels=[],
linkopts=[]):
native.cc_test(
name="%s%s" % (name, "_gpu"),
@@ -785,8 +814,8 @@ def tf_cuda_only_cc_test(name,
size=size,
args=args,
copts= _cuda_copts() + tf_copts(),
- data=data,
- deps=deps + if_cuda([
+ data=data + tf_binary_dynamic_kernel_dsos(kernels),
+ deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda([
clean_dep("//tensorflow/core:cuda"),
clean_dep("//tensorflow/core:gpu_lib")]),
linkopts=if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
@@ -829,9 +858,11 @@ def tf_cc_tests(srcs,
def tf_cc_test_mkl(srcs,
deps,
name="",
+ data=[],
linkstatic=0,
tags=[],
size="medium",
+ kernels=[],
args=None):
# -fno-exceptions in nocopts breaks compilation if header modules are enabled.
disable_header_modules = ["-use_header_modules"]
@@ -852,11 +883,12 @@ def tf_cc_test_mkl(srcs,
"-lm"
],
}) + _rpath_linkopts(src_to_test_name(src)),
- deps=deps + if_mkl(
+ deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
[
"//third_party/mkl:intel_binary_blob",
],
),
+ data=data + tf_binary_dynamic_kernel_dsos(kernels),
linkstatic=linkstatic,
tags=tags,
size=size,
@@ -896,12 +928,13 @@ def tf_cuda_cc_tests(srcs,
def tf_java_test(name,
srcs=[],
deps=[],
+ kernels=[],
*args,
**kwargs):
native.java_test(
name=name,
srcs=srcs,
- deps=deps + tf_binary_additional_srcs(),
+ deps=deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos(kernels) + tf_binary_dynamic_kernel_deps(kernels),
*args,
**kwargs)
@@ -1075,6 +1108,15 @@ def tf_kernel_library(
deps=deps,
**kwargs)
+ # TODO(gunan): CUDA dependency not clear here. Fix it.
+ tf_cc_shared_object(
+ name="libtfkernel_%s.so" % name,
+ srcs=srcs + hdrs,
+ copts=copts,
+ deps=deps,
+ tags=["manual", "notap"])
+
+
register_extension_info(
extension_name = "tf_kernel_library",
label_regex_for_dep = "{extension_name}(_gpu)?",
@@ -1351,7 +1393,7 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
name=name,
srcs=srcs,
deps=deps + if_cuda(cuda_deps),
- data=[name + "_check_deps"],
+ data=if_static([name + "_check_deps"]),
copts=tf_copts(is_external=True),
features = ["windows_export_all_symbols"],
linkopts=linkopts + select({
@@ -1447,7 +1489,7 @@ def tf_py_wrap_cc(name,
srcs=srcs,
swig_includes=swig_includes,
deps=deps + extra_deps,
- toolchain_deps=["//tools/defaults:crosstool"],
+ toolchain_deps=["@bazel_tools//tools/cpp:current_cc_toolchain"],
module_name=module_name,
py_module_name=name)
vscriptname=name+"_versionscript"
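Taken together, the tensorflow.bzl changes above let a binary or test either link kernels statically (the default) or pick them up as libtfkernel_<name>.so data dependencies when dynamic kernel linking is selected. A minimal BUILD sketch in Starlark, with hypothetical target names that are not part of this patch:

    load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_kernel_library")

    # With the patch above, tf_kernel_library also emits a
    # "libtfkernel_my_op_kernel.so" shared object (tagged manual/notap).
    tf_kernel_library(
        name = "my_op_kernel",
        srcs = ["my_op_kernel.cc"],
    )

    # The new `kernels` attribute routes these targets into `deps` for static
    # linking, or into `data` as DSOs when dynamic kernel linking is enabled.
    tf_cc_test(
        name = "my_op_kernel_test",
        srcs = ["my_op_kernel_test.cc"],
        deps = ["//tensorflow/core:test_main"],
        kernels = [":my_op_kernel"],
    )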
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index 9dbb5d16a4..c23b04b4ef 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
}
member_method {
name: "__init__"
- argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\'], "
+ argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
}
member_method {
name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 34a30c2874..6878d28fff 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
}
member_method {
name: "__init__"
- argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\'], "
+ argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
}
member_method {
name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index 8295905975..65cfad77d1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -267,6 +267,10 @@ tf_class {
argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
}
member_method {
+ name: "symbolic_set_inputs"
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
name: "test_on_batch"
argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt
index 2cd83baf65..2e9de9ebb2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt
@@ -22,7 +22,7 @@ tf_module {
}
member_method {
name: "relu"
- argspec: "args=[\'x\', \'alpha\', \'max_value\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+ argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0\'], "
}
member_method {
name: "selu"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
index fddac63b78..126ce8db6a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
@@ -366,7 +366,7 @@ tf_module {
}
member_method {
name: "relu"
- argspec: "args=[\'x\', \'alpha\', \'max_value\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+ argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0\'], "
}
member_method {
name: "repeat"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt
index c00fa79adf..4d3de58bd1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -82,7 +82,7 @@ tf_class {
}
member_method {
name: "__init__"
- argspec: "args=[\'self\', \'max_value\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
+ argspec: "args=[\'self\', \'max_value\', \'negative_slope\', \'threshold\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'0\', \'0\'], "
}
member_method {
name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
index a97a9b5758..73b577da37 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
@@ -22,7 +22,7 @@ tf_module {
}
member_method {
name: "binary_accuracy"
- argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+ argspec: "args=[\'y_true\', \'y_pred\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.5\'], "
}
member_method {
name: "binary_crossentropy"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index 5211657414..6a83129f7d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -267,6 +267,10 @@ tf_class {
argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
}
member_method {
+ name: "symbolic_set_inputs"
+ argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+ }
+ member_method {
name: "test_on_batch"
argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
new file mode 100644
index 0000000000..49ff85728f
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.linalg.LinearOperatorZeros.__metaclass__"
+tf_class {
+ is_instance: "<class \'abc.ABCMeta\'>"
+ member_method {
+ name: "__init__"
+ }
+ member_method {
+ name: "mro"
+ }
+ member_method {
+ name: "register"
+ argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+ }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt
new file mode 100644
index 0000000000..a1b0e06b47
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -0,0 +1,130 @@
+path: "tensorflow.linalg.LinearOperatorZeros"
+tf_class {
+ is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_zeros.LinearOperatorZeros\'>"
+ is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+ is_instance: "<type \'object\'>"
+ member {
+ name: "batch_shape"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "domain_dimension"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "dtype"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "graph_parents"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "is_non_singular"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "is_positive_definite"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "is_self_adjoint"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "is_square"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "name"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "range_dimension"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "shape"
+ mtype: "<type \'property\'>"
+ }
+ member {
+ name: "tensor_rank"
+ mtype: "<type \'property\'>"
+ }
+ member_method {
+ name: "__init__"
+ argspec: "args=[\'self\', \'num_rows\', \'num_columns\', \'batch_shape\', \'dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'assert_proper_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'True\', \'False\', \'True\', \'False\', \'LinearOperatorZeros\'], "
+ }
+ member_method {
+ name: "add_to_tensor"
+ argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+ }
+ member_method {
+ name: "assert_non_singular"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+ }
+ member_method {
+ name: "assert_positive_definite"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+ }
+ member_method {
+ name: "assert_self_adjoint"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+ }
+ member_method {
+ name: "batch_shape_tensor"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+ }
+ member_method {
+ name: "determinant"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+ }
+ member_method {
+ name: "diag_part"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+ }
+ member_method {
+ name: "domain_dimension_tensor"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+ }
+ member_method {
+ name: "log_abs_determinant"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+ }
+ member_method {
+ name: "matmul"
+ argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+ }
+ member_method {
+ name: "matvec"
+ argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+ }
+ member_method {
+ name: "range_dimension_tensor"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+ }
+ member_method {
+ name: "shape_tensor"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+ }
+ member_method {
+ name: "solve"
+ argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+ }
+ member_method {
+ name: "solvevec"
+ argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+ }
+ member_method {
+ name: "tensor_rank_tensor"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+ }
+ member_method {
+ name: "to_dense"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+ }
+ member_method {
+ name: "trace"
+ argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+ }
+}
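A usage sketch assembled from the argspecs above; the shapes are illustrative only. With the recorded defaults the operator is square and self-adjoint:

    import tensorflow as tf

    op = tf.linalg.LinearOperatorZeros(num_rows=3)  # 3x3 zero operator
    x = tf.ones([3, 2])

    y = op.matmul(x)        # a [3, 2] tensor of zeros
    dense = op.to_dense()   # the explicit [3, 3] zero matrix
    tr = op.trace()         # scalar 0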
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
index 3b5845f99a..d979116887 100644
--- a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
@@ -52,6 +52,10 @@ tf_module {
name: "LinearOperatorScaledIdentity"
mtype: "<class \'abc.ABCMeta\'>"
}
+ member {
+ name: "LinearOperatorZeros"
+ mtype: "<class \'abc.ABCMeta\'>"
+ }
member_method {
name: "adjoint"
argspec: "args=[\'matrix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index 6ab703d93d..383f9545c9 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -15,10 +15,6 @@ RUN add-apt-repository -y ppa:openjdk-r/ppa && \
add-apt-repository -y ppa:george-edison55/cmake-3.x
RUN /install/install_deb_packages.sh
-# Install NCCL
-RUN apt-get update && apt-get install -y --no-install-recommends \
- libnccl2=2.2.13-1+cuda9.0
-
RUN /install/install_pip_packages.sh
RUN /install/install_bazel.sh
RUN /install/install_golang.sh
@@ -27,6 +23,11 @@ RUN /install/install_golang.sh
COPY install/.bazelrc /etc/bazel.bazelrc
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+# Link the NCCL library and header where the build script expects them.
+RUN mkdir /usr/local/cuda-9.0/lib && \
+ ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
+ ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+
# Configure the build for our CUDA configuration.
ENV TF_NEED_CUDA 1
ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index db37edf809..866fe95d2b 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -354,7 +354,7 @@ do_external_licenses_check(){
# Whitelist
echo ${EXTRA_LICENSE_FILE}
- grep -e "@bazel_tools//src" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -e "@com_github_googlecloudplatform_google_cloud_cpp//" -v ${EXTRA_LICENSES_FILE} > temp.txt
+ grep -e "@bazel_tools//src" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -e "@com_github_googlecloudplatform_google_cloud_cpp//" -e "@embedded_jdk//" -v ${EXTRA_LICENSES_FILE} > temp.txt
mv temp.txt ${EXTRA_LICENSES_FILE}
diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index adbff8f6ef..e284401b8a 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -15,7 +15,7 @@
# ==============================================================================
# Select bazel version.
-BAZEL_VERSION="0.14.1"
+BAZEL_VERSION="0.15.0"
set +e
local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
index 9d24b3e421..87be81577d 100755
--- a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
@@ -18,7 +18,7 @@
# It will compile bazel from source and install it in /usr/local/bin
# Select bazel version.
-BAZEL_VERSION="0.14.1"
+BAZEL_VERSION="0.15.0"
set +e
local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index c03cbd9c66..0482cf619a 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -33,10 +33,10 @@ function set_remote_cache_options {
echo "build --tls_enabled=true" >> "${TMP_BAZELRC}"
echo "build --remote_timeout=3600" >> "${TMP_BAZELRC}"
echo "build --auth_enabled=true" >> "${TMP_BAZELRC}"
- echo "build --spawn_strategy=remote" >> "${TMP_BAZELRC}"
- echo "build --strategy=Javac=remote" >> "${TMP_BAZELRC}"
- echo "build --strategy=Closure=remote" >> "${TMP_BAZELRC}"
- echo "build --genrule_strategy=remote" >> "${TMP_BAZELRC}"
+ echo "build --spawn_strategy=standalone" >> "${TMP_BAZELRC}"
+ echo "build --strategy=Javac=standalone" >> "${TMP_BAZELRC}"
+ echo "build --strategy=Closure=standalone" >> "${TMP_BAZELRC}"
+ echo "build --genrule_strategy=standalone" >> "${TMP_BAZELRC}"
echo "build --google_credentials=$GOOGLE_CLOUD_CREDENTIAL" >> "${TMP_BAZELRC}"
}
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 42f58deb42..47e0e5dd59 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -53,20 +53,31 @@ function cleanup {
}
trap cleanup EXIT
-skip_test=0
-release_build=0
+PY_TEST_DIR="py_test_dir"
+SKIP_TEST=0
+RELEASE_BUILD=0
+TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \
+ //${PY_TEST_DIR}/tensorflow/contrib/... "
+
+# --skip_test Skip running tests
+# --enable_remote_cache Add options to enable remote cache for build and test
+# --release_build Build for release; compilation takes longer to
+# ensure performance
+# --test_core_only Use tensorflow/python/... as test target
+# --test_contrib_only Use tensorflow/contrib/... as test target
for ARG in "$@"; do
- if [[ "$ARG" == --skip_test ]]; then
- skip_test=1
- elif [[ "$ARG" == --enable_remote_cache ]]; then
- set_remote_cache_options
- elif [[ "$ARG" == --release_build ]]; then
- release_build=1
- fi
+ case "$ARG" in
+ --skip_test) SKIP_TEST=1 ;;
+ --enable_remote_cache) set_remote_cache_options ;;
+ --release_build) RELEASE_BUILD=1 ;;
+ --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
+ --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+ *)
+ esac
done
-if [[ "$release_build" == 1 ]]; then
+if [[ "$RELEASE_BUILD" == 1 ]]; then
# Overriding eigen strong inline speeds up compilation of conv_grad_ops_3d.cc and conv_ops_3d.cc
# by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
# Because this hurts TF's runtime performance, we don't override it in release builds.
@@ -86,12 +97,11 @@ run_configure_for_cpu_build
bazel build --announce_rc --config=opt tensorflow/tools/pip_package:build_pip_package || exit $?
-if [[ "$skip_test" == 1 ]]; then
+if [[ "$SKIP_TEST" == 1 ]]; then
exit 0
fi
# Create a python test directory to avoid package name conflict
-PY_TEST_DIR="py_test_dir"
create_python_test_dir "${PY_TEST_DIR}"
./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}"
@@ -112,5 +122,4 @@ bazel test --announce_rc --config=opt -k --test_output=errors \
--test_size_filters=small,medium \
--jobs="${N_JOBS}" --test_timeout="300,450,1200,3600" \
--flaky_test_attempts=3 \
- //${PY_TEST_DIR}/tensorflow/python/... \
- //${PY_TEST_DIR}/tensorflow/contrib/...
+ ${TEST_TARGET}
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 2a8c2d9167..e3eee11080 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -53,20 +53,31 @@ function cleanup {
}
trap cleanup EXIT
-skip_test=0
-release_build=0
+PY_TEST_DIR="py_test_dir"
+SKIP_TEST=0
+RELEASE_BUILD=0
+TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \
+ //${PY_TEST_DIR}/tensorflow/contrib/... "
+
+# --skip_test Skip running tests
+# --enable_remote_cache Add options to enable remote cache for build and test
+# --release_build Build for release; compilation takes longer to
+# ensure performance
+# --test_core_only Use tensorflow/python/... as test target
+# --test_contrib_only Use tensorflow/contrib/... as test target
for ARG in "$@"; do
- if [[ "$ARG" == --skip_test ]]; then
- skip_test=1
- elif [[ "$ARG" == --enable_remote_cache ]]; then
- set_remote_cache_options
- elif [[ "$ARG" == --release_build ]]; then
- release_build=1
- fi
+ case "$ARG" in
+ --skip_test) SKIP_TEST=1 ;;
+ --enable_remote_cache) set_remote_cache_options ;;
+ --release_build) RELEASE_BUILD=1 ;;
+ --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
+ --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+ *)
+ esac
done
-if [[ "$release_build" == 1 ]]; then
+if [[ "$RELEASE_BUILD" == 1 ]]; then
# Overriding eigen strong inline speeds up compilation of conv_grad_ops_3d.cc and conv_ops_3d.cc
# by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
# Because this hurts TF's runtime performance, we don't override it in release builds.
@@ -89,12 +100,11 @@ run_configure_for_gpu_build
bazel build --announce_rc --config=opt tensorflow/tools/pip_package:build_pip_package || exit $?
-if [[ "$skip_test" == 1 ]]; then
+if [[ "$SKIP_TEST" == 1 ]]; then
exit 0
fi
# Create a python test directory to avoid package name conflict
-PY_TEST_DIR="py_test_dir"
create_python_test_dir "${PY_TEST_DIR}"
./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}"
@@ -117,5 +127,4 @@ bazel test --announce_rc --config=opt -k --test_output=errors \
--test_size_filters=small,medium \
--local_test_jobs=$TF_GPU_COUNT --test_timeout="300,450,1200,3600" \
--flaky_test_attempts=3 \
- //${PY_TEST_DIR}/tensorflow/python/... \
- //${PY_TEST_DIR}/tensorflow/contrib/...
+ ${TEST_TARGET}
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index b7bfb29aae..55792c51fe 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -8,10 +8,17 @@ load(
"tf_cc_test", # @unused
)
+py_library(
+ name = "ast_edits",
+ srcs = ["ast_edits.py"],
+ srcs_version = "PY2AND3",
+)
+
py_binary(
name = "tf_upgrade",
srcs = ["tf_upgrade.py"],
srcs_version = "PY2AND3",
+ deps = [":ast_edits"],
)
py_test(
@@ -26,6 +33,28 @@ py_test(
],
)
+py_binary(
+ name = "tf_upgrade_v2",
+ srcs = [
+ "renames_v2.py",
+ "tf_upgrade_v2.py",
+ ],
+ srcs_version = "PY2AND3",
+ deps = [":ast_edits"],
+)
+
+py_test(
+ name = "tf_upgrade_v2_test",
+ srcs = ["tf_upgrade_v2_test.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ ":tf_upgrade_v2",
+ "//tensorflow/python:client_testlib",
+ "//tensorflow/python:framework_test_lib",
+ "@six_archive//:six",
+ ],
+)
+
# Keep for reference, this test will succeed in 0.11 but fail in 1.0
# py_test(
# name = "test_file_v0_11",
@@ -62,9 +91,37 @@ py_test(
],
)
+genrule(
+ name = "generate_upgraded_file_v2",
+ testonly = 1,
+ srcs = ["testdata/test_file_v1_10.py"],
+ outs = [
+ "test_file_v2_0.py",
+ "report_v2.txt",
+ ],
+ cmd = ("$(location :tf_upgrade_v2)" +
+ " --infile $(location testdata/test_file_v1_10.py)" +
+ " --outfile $(location test_file_v2_0.py)" +
+ " --reportfile $(location report_v2.txt)"),
+ tools = [":tf_upgrade_v2"],
+)
+
+py_test(
+ name = "test_file_v2_0",
+ size = "small",
+ srcs = ["test_file_v2_0.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ "//tensorflow:tensorflow_py",
+ ],
+)
+
exports_files(
[
+ "ast_edits.py",
"tf_upgrade.py",
+ "renames_v2.py",
"testdata/test_file_v0_11.py",
+ "testdata/test_file_v1_10.py",
],
)
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
new file mode 100644
index 0000000000..216aa41b60
--- /dev/null
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -0,0 +1,134 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+ bazel build tensorflow/tools/compatibility/update:generate_v2_renames_map
+ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_renames_map
+This file should be updated whenever endpoints are deprecated.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+renames = {
+ 'tf.acos': 'tf.math.acos',
+ 'tf.acosh': 'tf.math.acosh',
+ 'tf.add': 'tf.math.add',
+ 'tf.as_string': 'tf.dtypes.as_string',
+ 'tf.asin': 'tf.math.asin',
+ 'tf.asinh': 'tf.math.asinh',
+ 'tf.atan': 'tf.math.atan',
+ 'tf.atan2': 'tf.math.atan2',
+ 'tf.atanh': 'tf.math.atanh',
+ 'tf.batch_to_space_nd': 'tf.manip.batch_to_space_nd',
+ 'tf.betainc': 'tf.math.betainc',
+ 'tf.ceil': 'tf.math.ceil',
+ 'tf.check_numerics': 'tf.debugging.check_numerics',
+ 'tf.cholesky': 'tf.linalg.cholesky',
+ 'tf.cos': 'tf.math.cos',
+ 'tf.cosh': 'tf.math.cosh',
+ 'tf.cross': 'tf.linalg.cross',
+ 'tf.decode_base64': 'tf.io.decode_base64',
+ 'tf.decode_compressed': 'tf.io.decode_compressed',
+ 'tf.decode_json_example': 'tf.io.decode_json_example',
+ 'tf.decode_raw': 'tf.io.decode_raw',
+ 'tf.dequantize': 'tf.quantization.dequantize',
+ 'tf.diag': 'tf.linalg.tensor_diag',
+ 'tf.diag_part': 'tf.linalg.tensor_diag_part',
+ 'tf.digamma': 'tf.math.digamma',
+ 'tf.encode_base64': 'tf.io.encode_base64',
+ 'tf.equal': 'tf.math.equal',
+ 'tf.erfc': 'tf.math.erfc',
+ 'tf.exp': 'tf.math.exp',
+ 'tf.expm1': 'tf.math.expm1',
+ 'tf.extract_image_patches': 'tf.image.extract_image_patches',
+ 'tf.fake_quant_with_min_max_args': 'tf.quantization.fake_quant_with_min_max_args',
+ 'tf.fake_quant_with_min_max_args_gradient': 'tf.quantization.fake_quant_with_min_max_args_gradient',
+ 'tf.fake_quant_with_min_max_vars': 'tf.quantization.fake_quant_with_min_max_vars',
+ 'tf.fake_quant_with_min_max_vars_gradient': 'tf.quantization.fake_quant_with_min_max_vars_gradient',
+ 'tf.fake_quant_with_min_max_vars_per_channel': 'tf.quantization.fake_quant_with_min_max_vars_per_channel',
+ 'tf.fake_quant_with_min_max_vars_per_channel_gradient': 'tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient',
+ 'tf.fft': 'tf.spectral.fft',
+ 'tf.floor': 'tf.math.floor',
+ 'tf.gather_nd': 'tf.manip.gather_nd',
+ 'tf.greater': 'tf.math.greater',
+ 'tf.greater_equal': 'tf.math.greater_equal',
+ 'tf.ifft': 'tf.spectral.ifft',
+ 'tf.igamma': 'tf.math.igamma',
+ 'tf.igammac': 'tf.math.igammac',
+ 'tf.invert_permutation': 'tf.math.invert_permutation',
+ 'tf.is_finite': 'tf.debugging.is_finite',
+ 'tf.is_inf': 'tf.debugging.is_inf',
+ 'tf.is_nan': 'tf.debugging.is_nan',
+ 'tf.less': 'tf.math.less',
+ 'tf.less_equal': 'tf.math.less_equal',
+ 'tf.lgamma': 'tf.math.lgamma',
+ 'tf.log': 'tf.math.log',
+ 'tf.log1p': 'tf.math.log1p',
+ 'tf.logical_and': 'tf.math.logical_and',
+ 'tf.logical_not': 'tf.math.logical_not',
+ 'tf.logical_or': 'tf.math.logical_or',
+ 'tf.matching_files': 'tf.io.matching_files',
+ 'tf.matrix_band_part': 'tf.linalg.band_part',
+ 'tf.matrix_determinant': 'tf.linalg.det',
+ 'tf.matrix_diag': 'tf.linalg.diag',
+ 'tf.matrix_diag_part': 'tf.linalg.diag_part',
+ 'tf.matrix_inverse': 'tf.linalg.inv',
+ 'tf.matrix_set_diag': 'tf.linalg.set_diag',
+ 'tf.matrix_solve': 'tf.linalg.solve',
+ 'tf.matrix_triangular_solve': 'tf.linalg.triangular_solve',
+ 'tf.maximum': 'tf.math.maximum',
+ 'tf.minimum': 'tf.math.minimum',
+ 'tf.not_equal': 'tf.math.not_equal',
+ 'tf.parse_tensor': 'tf.io.parse_tensor',
+ 'tf.polygamma': 'tf.math.polygamma',
+ 'tf.qr': 'tf.linalg.qr',
+ 'tf.quantized_concat': 'tf.quantization.quantized_concat',
+ 'tf.read_file': 'tf.io.read_file',
+ 'tf.reciprocal': 'tf.math.reciprocal',
+ 'tf.regex_replace': 'tf.strings.regex_replace',
+ 'tf.reshape': 'tf.manip.reshape',
+ 'tf.reverse': 'tf.manip.reverse',
+ 'tf.reverse_v2': 'tf.manip.reverse',
+ 'tf.rint': 'tf.math.rint',
+ 'tf.rsqrt': 'tf.math.rsqrt',
+ 'tf.scatter_nd': 'tf.manip.scatter_nd',
+ 'tf.segment_max': 'tf.math.segment_max',
+ 'tf.segment_mean': 'tf.math.segment_mean',
+ 'tf.segment_min': 'tf.math.segment_min',
+ 'tf.segment_prod': 'tf.math.segment_prod',
+ 'tf.segment_sum': 'tf.math.segment_sum',
+ 'tf.sin': 'tf.math.sin',
+ 'tf.sinh': 'tf.math.sinh',
+ 'tf.space_to_batch_nd': 'tf.manip.space_to_batch_nd',
+ 'tf.squared_difference': 'tf.math.squared_difference',
+ 'tf.string_join': 'tf.strings.join',
+ 'tf.string_strip': 'tf.strings.strip',
+ 'tf.string_to_hash_bucket': 'tf.strings.to_hash_bucket',
+ 'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
+ 'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
+ 'tf.string_to_number': 'tf.strings.to_number',
+ 'tf.substr': 'tf.strings.substr',
+ 'tf.tan': 'tf.math.tan',
+ 'tf.tile': 'tf.manip.tile',
+ 'tf.unsorted_segment_max': 'tf.math.unsorted_segment_max',
+ 'tf.unsorted_segment_min': 'tf.math.unsorted_segment_min',
+ 'tf.unsorted_segment_prod': 'tf.math.unsorted_segment_prod',
+ 'tf.unsorted_segment_sum': 'tf.math.unsorted_segment_sum',
+ 'tf.write_file': 'tf.io.write_file',
+ 'tf.zeta': 'tf.math.zeta'
+}
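The map feeds TFAPIChangeSpec.function_renames in tf_upgrade_v2.py below, where ast_edits rewrites call sites using exact AST source locations. As a toy illustration of the data only (not the real rewriting machinery), a regex pass over whole attribute paths:

    import re

    def apply_renames(source, renames):
        # Longest names first, so a longer entry is never clobbered by a
        # shorter prefix of itself.
        for old, new in sorted(renames.items(), key=lambda kv: -len(kv[0])):
            source = re.sub(r"\b%s\b" % re.escape(old), new, source)
        return source

    print(apply_renames("y = tf.acos(x) + tf.log(x)",
                        {"tf.acos": "tf.math.acos", "tf.log": "tf.math.log"}))
    # -> y = tf.math.acos(x) + tf.math.log(x)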
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_10.py b/tensorflow/tools/compatibility/testdata/test_file_v1_10.py
new file mode 100644
index 0000000000..a49035a1a0
--- /dev/null
+++ b/tensorflow/tools/compatibility/testdata/test_file_v1_10.py
@@ -0,0 +1,34 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf upgrader."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+
+
+class TestUpgrade(test_util.TensorFlowTestCase):
+ """Test various APIs that have been changed in 2.0."""
+
+ def testRenames(self):
+ with self.test_session():
+ self.assertAllClose(1.04719755, tf.acos(0.5).eval())
+ self.assertAllClose(0.5, tf.rsqrt(4.0).eval())
+
+if __name__ == "__main__":
+ test_lib.main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 1f8833582a..96705b1a4c 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -19,491 +19,11 @@ from __future__ import division
from __future__ import print_function
import argparse
-import ast
-import collections
-import os
-import shutil
-import sys
-import tempfile
-import traceback
+from tensorflow.tools.compatibility import ast_edits
-class APIChangeSpec(object):
- """This class defines the transformations that need to happen.
- This class must provide the following fields:
-
- * `function_keyword_renames`: maps function names to a map of old -> new
- argument names
- * `function_renames`: maps function names to new function names
- * `change_to_function`: a set of function names that have changed (for
- notifications)
- * `function_reorders`: maps functions whose argument order has changed to the
- list of arguments in the new order
- * `function_handle`: maps function names to custom handlers for the function
-
- For an example, see `TFAPIChangeSpec`.
- """
-
-
-class _FileEditTuple(
- collections.namedtuple("_FileEditTuple",
- ["comment", "line", "start", "old", "new"])):
- """Each edit that is recorded by a _FileEditRecorder.
-
- Fields:
- comment: A description of the edit and why it was made.
- line: The line number in the file where the edit occurs (1-indexed).
- start: The line number in the file where the edit occurs (0-indexed).
- old: text string to remove (this must match what was in file).
- new: text string to add in place of `old`.
- """
-
- __slots__ = ()
-
-
-class _FileEditRecorder(object):
- """Record changes that need to be done to the file."""
-
- def __init__(self, filename):
- # all edits are lists of chars
- self._filename = filename
-
- self._line_to_edit = collections.defaultdict(list)
- self._errors = []
-
- def process(self, text):
- """Process a list of strings, each corresponding to the recorded changes.
-
- Args:
- text: A list of lines of text (assumed to contain newlines)
- Returns:
- A tuple of the modified text and a textual description of what is done.
- Raises:
- ValueError: if substitution source location does not have expected text.
- """
-
- change_report = ""
-
- # Iterate of each line
- for line, edits in self._line_to_edit.items():
- offset = 0
- # sort by column so that edits are processed in order in order to make
- # indexing adjustments cumulative for changes that change the string
- # length
- edits.sort(key=lambda x: x.start)
-
- # Extract each line to a list of characters, because mutable lists
- # are editable, unlike immutable strings.
- char_array = list(text[line - 1])
-
- # Record a description of the change
- change_report += "%r Line %d\n" % (self._filename, line)
- change_report += "-" * 80 + "\n\n"
- for e in edits:
- change_report += "%s\n" % e.comment
- change_report += "\n Old: %s" % (text[line - 1])
-
- # Make underscore buffers for underlining where in the line the edit was
- change_list = [" "] * len(text[line - 1])
- change_list_new = [" "] * len(text[line - 1])
-
- # Iterate for each edit
- for e in edits:
- # Create effective start, end by accounting for change in length due
- # to previous edits
- start_eff = e.start + offset
- end_eff = start_eff + len(e.old)
-
- # Make sure the edit is changing what it should be changing
- old_actual = "".join(char_array[start_eff:end_eff])
- if old_actual != e.old:
- raise ValueError("Expected text %r but got %r" %
- ("".join(e.old), "".join(old_actual)))
- # Make the edit
- char_array[start_eff:end_eff] = list(e.new)
-
- # Create the underline highlighting of the before and after
- change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
- change_list_new[start_eff:end_eff] = "~" * len(e.new)
-
- # Keep track of how to generate effective ranges
- offset += len(e.new) - len(e.old)
-
- # Finish the report comment
- change_report += " %s\n" % "".join(change_list)
- text[line - 1] = "".join(char_array)
- change_report += " New: %s" % (text[line - 1])
- change_report += " %s\n\n" % "".join(change_list_new)
- return "".join(text), change_report, self._errors
-
- def add(self, comment, line, start, old, new, error=None):
- """Add a new change that is needed.
-
- Args:
- comment: A description of what was changed
- line: Line number (1 indexed)
- start: Column offset (0 indexed)
- old: old text
- new: new text
- error: this "edit" is something that cannot be fixed automatically
- Returns:
- None
- """
-
- self._line_to_edit[line].append(
- _FileEditTuple(comment, line, start, old, new))
- if error:
- self._errors.append("%s:%d: %s" % (self._filename, line, error))
-
-
-class _ASTCallVisitor(ast.NodeVisitor):
- """AST Visitor that processes function calls.
-
- Updates function calls from old API version to new API version using a given
- change spec.
- """
-
- def __init__(self, filename, lines, api_change_spec):
- self._filename = filename
- self._file_edit = _FileEditRecorder(filename)
- self._lines = lines
- self._api_change_spec = api_change_spec
-
- def process(self, lines):
- return self._file_edit.process(lines)
-
- def generic_visit(self, node):
- ast.NodeVisitor.generic_visit(self, node)
-
- def _rename_functions(self, node, full_name):
- function_renames = self._api_change_spec.function_renames
- try:
- new_name = function_renames[full_name]
- self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
- node.lineno, node.col_offset, full_name, new_name)
- except KeyError:
- pass
-
- def _get_attribute_full_path(self, node):
- """Traverse an attribute to generate a full name e.g. tf.foo.bar.
-
- Args:
- node: A Node of type Attribute.
-
- Returns:
- a '.'-delimited full-name or None if the tree was not a simple form.
- i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
- """
- curr = node
- items = []
- while not isinstance(curr, ast.Name):
- if not isinstance(curr, ast.Attribute):
- return None
- items.append(curr.attr)
- curr = curr.value
- items.append(curr.id)
- return ".".join(reversed(items))
-
- def _find_true_position(self, node):
- """Return correct line number and column offset for a given node.
-
- This is necessary mainly because ListComp's location reporting reports
- the next token after the list comprehension list opening.
-
- Args:
- node: Node for which we wish to know the lineno and col_offset
- """
- import re
- find_open = re.compile("^\s*(\\[).*$")
- find_string_chars = re.compile("['\"]")
-
- if isinstance(node, ast.ListComp):
- # Strangely, ast.ListComp returns the col_offset of the first token
- # after the '[' token which appears to be a bug. Workaround by
- # explicitly finding the real start of the list comprehension.
- line = node.lineno
- col = node.col_offset
- # loop over lines
- while 1:
- # Reverse the text to and regular expression search for whitespace
- text = self._lines[line - 1]
- reversed_preceding_text = text[:col][::-1]
- # First find if a [ can be found with only whitespace between it and
- # col.
- m = find_open.match(reversed_preceding_text)
- if m:
- new_col_offset = col - m.start(1) - 1
- return line, new_col_offset
- else:
- if (reversed_preceding_text == "" or
- reversed_preceding_text.isspace()):
- line = line - 1
- prev_line = self._lines[line - 1]
- # TODO(aselle):
- # this is poor comment detection, but it is good enough for
- # cases where the comment does not contain string literal starting/
- # ending characters. If ast gave us start and end locations of the
- # ast nodes rather than just start, we could use string literal
- # node ranges to filter out spurious #'s that appear in string
- # literals.
- comment_start = prev_line.find("#")
- if comment_start == -1:
- col = len(prev_line) - 1
- elif find_string_chars.search(prev_line[comment_start:]) is None:
- col = comment_start
- else:
- return None, None
- else:
- return None, None
- # Most other nodes return proper locations (with notably does not), but
- # it is not possible to use that in an argument.
- return node.lineno, node.col_offset
-
- def visit_Call(self, node): # pylint: disable=invalid-name
- """Handle visiting a call node in the AST.
-
- Args:
- node: Current Node
- """
-
- # Find a simple attribute name path e.g. "tf.foo.bar"
- full_name = self._get_attribute_full_path(node.func)
-
- # Make sure the func is marked as being part of a call
- node.func.is_function_for_call = True
-
- if full_name:
- # Call special handlers
- function_handles = self._api_change_spec.function_handle
- if full_name in function_handles:
- function_handles[full_name](self._file_edit, node)
-
- # Examine any non-keyword argument and make it into a keyword argument
- # if reordering required.
- function_reorders = self._api_change_spec.function_reorders
- function_keyword_renames = (
- self._api_change_spec.function_keyword_renames)
-
- if full_name in function_reorders:
- reordered = function_reorders[full_name]
- for idx, arg in enumerate(node.args):
- lineno, col_offset = self._find_true_position(arg)
- if lineno is None or col_offset is None:
- self._file_edit.add(
- "Failed to add keyword %r to reordered function %r" %
- (reordered[idx], full_name),
- arg.lineno,
- arg.col_offset,
- "",
- "",
- error="A necessary keyword argument failed to be inserted.")
- else:
- keyword_arg = reordered[idx]
- if (full_name in function_keyword_renames and
- keyword_arg in function_keyword_renames[full_name]):
- keyword_arg = function_keyword_renames[full_name][keyword_arg]
- self._file_edit.add("Added keyword %r to reordered function %r" %
- (reordered[idx], full_name), lineno, col_offset,
- "", keyword_arg + "=")
-
- # Examine each keyword argument and convert it to the final renamed form
- renamed_keywords = ({} if full_name not in function_keyword_renames else
- function_keyword_renames[full_name])
- for keyword in node.keywords:
- argkey = keyword.arg
- argval = keyword.value
-
- if argkey in renamed_keywords:
- argval_lineno, argval_col_offset = self._find_true_position(argval)
- if argval_lineno is not None and argval_col_offset is not None:
- # TODO(aselle): We should scan backward to find the start of the
- # keyword key. Unfortunately ast does not give you the location of
- # keyword keys, so we are forced to infer it from the keyword arg
- # value.
- key_start = argval_col_offset - len(argkey) - 1
- key_end = key_start + len(argkey) + 1
- if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
- "="):
- self._file_edit.add("Renamed keyword argument from %r to %r" %
- (argkey,
- renamed_keywords[argkey]), argval_lineno,
- argval_col_offset - len(argkey) - 1,
- argkey + "=", renamed_keywords[argkey] + "=")
- continue
- self._file_edit.add(
- "Failed to rename keyword argument from %r to %r" %
- (argkey, renamed_keywords[argkey]),
- argval.lineno,
- argval.col_offset - len(argkey) - 1,
- "",
- "",
- error="Failed to find keyword lexographically. Fix manually.")
-
- ast.NodeVisitor.generic_visit(self, node)
-
- def visit_Attribute(self, node): # pylint: disable=invalid-name
- """Handle bare Attributes i.e. [tf.foo, tf.bar].
-
- Args:
- node: Node that is of type ast.Attribute
- """
- full_name = self._get_attribute_full_path(node)
- if full_name:
- self._rename_functions(node, full_name)
- if full_name in self._api_change_spec.change_to_function:
- if not hasattr(node, "is_function_for_call"):
- new_text = full_name + "()"
- self._file_edit.add("Changed %r to %r" % (full_name, new_text),
- node.lineno, node.col_offset, full_name, new_text)
-
- ast.NodeVisitor.generic_visit(self, node)
-
-
-class ASTCodeUpgrader(object):
- """Handles upgrading a set of Python files using a given API change spec."""
-
- def __init__(self, api_change_spec):
- if not isinstance(api_change_spec, APIChangeSpec):
- raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" %
- type(api_change_spec))
- self._api_change_spec = api_change_spec
-
- def process_file(self, in_filename, out_filename):
- """Process the given python file for incompatible changes.
-
- Args:
- in_filename: filename to parse
- out_filename: output file to write to
- Returns:
- A tuple representing number of files processed, log of actions, errors
- """
-
- # Write to a temporary file, just in case we are doing an implace modify.
- with open(in_filename, "r") as in_file, \
- tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
- ret = self.process_opened_file(in_filename, in_file, out_filename,
- temp_file)
-
- shutil.move(temp_file.name, out_filename)
- return ret
-
- # Broad exceptions are required here because ast throws whatever it wants.
- # pylint: disable=broad-except
- def process_opened_file(self, in_filename, in_file, out_filename, out_file):
- """Process the given python file for incompatible changes.
-
- This function is split out to facilitate StringIO testing from
- tf_upgrade_test.py.
-
- Args:
- in_filename: filename to parse
- in_file: opened file (or StringIO)
- out_filename: output file to write to
- out_file: opened file (or StringIO)
- Returns:
- A tuple representing number of files processed, log of actions, errors
- """
- process_errors = []
- text = "-" * 80 + "\n"
- text += "Processing file %r\n outputting to %r\n" % (in_filename,
- out_filename)
- text += "-" * 80 + "\n\n"
-
- parsed_ast = None
- lines = in_file.readlines()
- try:
- parsed_ast = ast.parse("".join(lines))
- except Exception:
- text += "Failed to parse %r\n\n" % in_filename
- text += traceback.format_exc()
- if parsed_ast:
- visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
- visitor.visit(parsed_ast)
- out_text, new_text, process_errors = visitor.process(lines)
- text += new_text
- if out_file:
- out_file.write(out_text)
- text += "\n"
- return 1, text, process_errors
-
- # pylint: enable=broad-except
-
- def process_tree(self, root_directory, output_root_directory,
- copy_other_files):
- """Processes upgrades on an entire tree of python files in place.
-
- Note that only Python files. If you have custom code in other languages,
- you will need to manually upgrade those.
-
- Args:
- root_directory: Directory to walk and process.
- output_root_directory: Directory to use as base.
- copy_other_files: Copy files that are not touched by this converter.
-
- Returns:
- A tuple of files processed, the report string ofr all files, and errors
- """
-
- # make sure output directory doesn't exist
- if output_root_directory and os.path.exists(output_root_directory):
- print("Output directory %r must not already exist." %
- (output_root_directory))
- sys.exit(1)
-
- # make sure output directory does not overlap with root_directory
- norm_root = os.path.split(os.path.normpath(root_directory))
- norm_output = os.path.split(os.path.normpath(output_root_directory))
- if norm_root == norm_output:
- print("Output directory %r same as input directory %r" %
- (root_directory, output_root_directory))
- sys.exit(1)
-
- # Collect list of files to process (we do this to correctly handle if the
- # user puts the output directory in some sub directory of the input dir)
- files_to_process = []
- files_to_copy = []
- for dir_name, _, file_list in os.walk(root_directory):
- py_files = [f for f in file_list if f.endswith(".py")]
- copy_files = [f for f in file_list if not f.endswith(".py")]
- for filename in py_files:
- fullpath = os.path.join(dir_name, filename)
- fullpath_output = os.path.join(output_root_directory,
- os.path.relpath(fullpath,
- root_directory))
- files_to_process.append((fullpath, fullpath_output))
- if copy_other_files:
- for filename in copy_files:
- fullpath = os.path.join(dir_name, filename)
- fullpath_output = os.path.join(output_root_directory,
- os.path.relpath(
- fullpath, root_directory))
- files_to_copy.append((fullpath, fullpath_output))
-
- file_count = 0
- tree_errors = []
- report = ""
- report += ("=" * 80) + "\n"
- report += "Input tree: %r\n" % root_directory
- report += ("=" * 80) + "\n"
-
- for input_path, output_path in files_to_process:
- output_directory = os.path.dirname(output_path)
- if not os.path.isdir(output_directory):
- os.makedirs(output_directory)
- file_count += 1
- _, l_report, l_errors = self.process_file(input_path, output_path)
- tree_errors += l_errors
- report += l_report
- for input_path, output_path in files_to_copy:
- output_directory = os.path.dirname(output_path)
- if not os.path.isdir(output_directory):
- os.makedirs(output_directory)
- shutil.copy(input_path, output_path)
- return file_count, report, tree_errors
-
-
-class TFAPIChangeSpec(APIChangeSpec):
+class TFAPIChangeSpec(ast_edits.APIChangeSpec):
"""List of maps that describe what changed in the API."""
def __init__(self):
@@ -718,7 +238,7 @@ Simple usage:
default="report.txt")
args = parser.parse_args()
- upgrade = ASTCodeUpgrader(TFAPIChangeSpec())
+ upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec())
report_text = None
report_filename = args.report_filename
files_processed = 0
diff --git a/tensorflow/tools/compatibility/tf_upgrade_test.py b/tensorflow/tools/compatibility/tf_upgrade_test.py
index 3d02eacba6..66325ea2ad 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_test.py
@@ -22,6 +22,7 @@ import tempfile
import six
from tensorflow.python.framework import test_util
from tensorflow.python.platform import test as test_lib
+from tensorflow.tools.compatibility import ast_edits
from tensorflow.tools.compatibility import tf_upgrade
@@ -36,7 +37,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
def _upgrade(self, old_file_text):
in_file = six.StringIO(old_file_text)
out_file = six.StringIO()
- upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
+ upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
count, report, errors = (
upgrader.process_opened_file("test.py", in_file,
"test_out.py", out_file))
@@ -139,7 +140,7 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase):
upgraded = "tf.multiply(a, b)\n"
temp_file.write(original)
temp_file.close()
- upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
+ upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
upgrader.process_file(temp_file.name, temp_file.name)
self.assertAllEqual(open(temp_file.name).read(), upgraded)
os.unlink(temp_file.name)
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
new file mode 100644
index 0000000000..9702430a12
--- /dev/null
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -0,0 +1,115 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Upgrader for Python scripts from 1.* TensorFlow to 2.0 TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+
+from tensorflow.tools.compatibility import ast_edits
+from tensorflow.tools.compatibility import renames_v2
+
+
+class TFAPIChangeSpec(ast_edits.APIChangeSpec):
+ """List of maps that describe what changed in the API."""
+
+ def __init__(self):
+ # Maps from a function name to a dictionary that describes how to
+ # map from an old argument keyword to the new argument keyword.
+ self.function_keyword_renames = {}
+
+ # Mapping from function to the new name of the function
+ self.function_renames = renames_v2.renames
+
+ # Variables that should be changed to functions.
+ self.change_to_function = {}
+
+ # Functions that were reordered should be changed to the new keyword args
+ # for safety, if positional arguments are used. If you have reversed the
+ # positional arguments yourself, this could do the wrong thing.
+ self.function_reorders = {}
+
+ # Specially handled functions.
+ self.function_handle = {}
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="""Convert a TensorFlow Python file to 2.0
+
+Simple usage:
+ tf_upgrade_v2.py --infile foo.py --outfile bar.py
+ tf_upgrade_v2.py --intree ~/code/old --outtree ~/code/new
+""")
+ parser.add_argument(
+ "--infile",
+ dest="input_file",
+ help="If converting a single file, the name of the file "
+ "to convert")
+ parser.add_argument(
+ "--outfile",
+ dest="output_file",
+ help="If converting a single file, the output filename.")
+ parser.add_argument(
+ "--intree",
+ dest="input_tree",
+ help="If converting a whole tree of files, the directory "
+ "to read from (relative or absolute).")
+ parser.add_argument(
+ "--outtree",
+ dest="output_tree",
+ help="If converting a whole tree of files, the output "
+ "directory (relative or absolute).")
+ parser.add_argument(
+ "--copyotherfiles",
+ dest="copy_other_files",
+ help=("If converting a whole tree of files, whether to "
+ "copy the other files."),
+ type=bool,
+ default=False)
+ parser.add_argument(
+ "--reportfile",
+ dest="report_filename",
+ help=("The name of the file where the report log is "
+ "stored."
+ "(default: %(default)s)"),
+ default="report.txt")
+ args = parser.parse_args()
+
+ upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec())
+ report_text = None
+ report_filename = args.report_filename
+ files_processed = 0
+ if args.input_file:
+ files_processed, report_text, errors = upgrade.process_file(
+ args.input_file, args.output_file)
+ files_processed = 1
+ elif args.input_tree:
+ files_processed, report_text, errors = upgrade.process_tree(
+ args.input_tree, args.output_tree, args.copy_other_files)
+ else:
+ parser.print_help()
+ if report_text:
+ with open(report_filename, "w") as report_file:
+ report_file.write(report_text)
+ print("TensorFlow 2.0 Upgrade Script")
+ print("-----------------------------")
+ print("Converted %d files\n" % files_processed)
+ print("Detected %d errors that require attention" % len(errors))
+ print("-" * 80)
+ print("\n".join(errors))
+ print("\nMake sure to read the detailed log %r\n" % report_filename)
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
new file mode 100644
index 0000000000..57ac04de06
--- /dev/null
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -0,0 +1,83 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf 2.0 upgrader."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import tempfile
+import six
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+from tensorflow.tools.compatibility import ast_edits
+from tensorflow.tools.compatibility import tf_upgrade_v2
+
+
+class TestUpgrade(test_util.TensorFlowTestCase):
+ """Test various APIs that have been changed in 2.0.
+
+ We also test whether a converted file is executable. test_file_v1_10.py
+ aims to exhaustively test that API changes are convertible and actually
+ work when run with current TensorFlow.
+ """
+
+ def _upgrade(self, old_file_text):
+ in_file = six.StringIO(old_file_text)
+ out_file = six.StringIO()
+ upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade_v2.TFAPIChangeSpec())
+ count, report, errors = (
+ upgrader.process_opened_file("test.py", in_file,
+ "test_out.py", out_file))
+ return count, report, errors, out_file.getvalue()
+
+ def testParseError(self):
+ _, report, unused_errors, unused_new_text = self._upgrade(
+ "import tensorflow as tf\na + \n")
+ self.assertTrue(report.find("Failed to parse") != -1)
+
+ def testReport(self):
+ text = "tf.acos(a)\n"
+ _, report, unused_errors, unused_new_text = self._upgrade(text)
+ # This is not a complete test, but it is a sanity check that the
+ # report contains the expected information.
+ self.assertTrue(
+ report.find("Renamed function `tf.acos` to `tf.math.acos`") != -1)
+
+ def testRename(self):
+ text = "tf.acos(a)\n"
+ _, unused_report, unused_errors, new_text = self._upgrade(text)
+ self.assertEqual(new_text, "tf.math.acos(a)\n")
+ text = "tf.rsqrt(tf.log(3.8))\n"
+ _, unused_report, unused_errors, new_text = self._upgrade(text)
+ self.assertEqual(new_text, "tf.math.rsqrt(tf.math.log(3.8))\n")
+
+
+class TestUpgradeFiles(test_util.TensorFlowTestCase):
+
+ def testInplace(self):
+ """Check to make sure we don't have a file system race."""
+ temp_file = tempfile.NamedTemporaryFile("w", delete=False)
+ original = "tf.acos(a, b)\n"
+ upgraded = "tf.math.acos(a, b)\n"
+ temp_file.write(original)
+ temp_file.close()
+ upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade_v2.TFAPIChangeSpec())
+ upgrader.process_file(temp_file.name, temp_file.name)
+ with open(temp_file.name) as result_file:
+ self.assertAllEqual(result_file.read(), upgraded)
+ os.unlink(temp_file.name)
+
+
+if __name__ == "__main__":
+ test_lib.main()
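Tree conversion follows the same pattern; a short sketch under assumed paths
(/tmp/old_project and /tmp/new_project are hypothetical), mirroring the
positional process_tree call in the __main__ block of tf_upgrade_v2.py:

    from tensorflow.tools.compatibility import ast_edits
    from tensorflow.tools.compatibility import tf_upgrade_v2

    upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade_v2.TFAPIChangeSpec())
    # The third positional argument is copy_other_files, as in the CLI above.
    files_processed, report_text, errors = upgrader.process_tree(
        "/tmp/old_project", "/tmp/new_project", False)
    print("Converted %d files" % files_processed)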
diff --git a/tensorflow/tools/compatibility/update/BUILD b/tensorflow/tools/compatibility/update/BUILD
new file mode 100644
index 0000000000..feb37c902e
--- /dev/null
+++ b/tensorflow/tools/compatibility/update/BUILD
@@ -0,0 +1,15 @@
+licenses(["notice"]) # Apache 2.0
+
+package(default_visibility = ["//visibility:private"])
+
+py_binary(
+ name = "generate_v2_renames_map",
+ srcs = ["generate_v2_renames_map.py"],
+ srcs_version = "PY2AND3",
+ deps = [
+ "//tensorflow:tensorflow_py",
+ "//tensorflow/python:lib",
+ "//tensorflow/tools/common:public_api",
+ "//tensorflow/tools/common:traverse",
+ ],
+)
diff --git a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
new file mode 100644
index 0000000000..567eceb0b6
--- /dev/null
+++ b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""Script for updating tensorflow/tools/compatibility/renames_v2.py.
+
+To update renames_v2.py, run:
+ bazel build tensorflow/tools/compatibility/update:generate_v2_renames_map
+ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_renames_map
+"""
+# pylint: enable=line-too-long
+
+import tensorflow as tf
+
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.tools.common import public_api
+from tensorflow.tools.common import traverse
+
+
+_OUTPUT_FILE_PATH = 'third_party/tensorflow/tools/compatibility/renames_v2.py'
+_FILE_HEADER = """# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+\"\"\"List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+ bazel build tensorflow/tools/compatibility/update:generate_v2_renames_map
+ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_renames_map
+This file should be updated whenever endpoints are deprecated.
+\"\"\"
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""
+
+
+def update_renames_v2(output_file_path):
+ """Writes a Python dictionary mapping deprecated to canonical API names.
+
+ Args:
+ output_file_path: File path to write output to. Any existing contents
+ will be replaced.
+ """
+ # Set of rename lines to write to output file in the form:
+ # 'tf.deprecated_name': 'tf.canonical_name'
+ rename_line_set = set()
+ # _tf_api_names attribute name
+ tensorflow_api_attr = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+
+ def visit(unused_path, unused_parent, children):
+ """Visitor that collects rename strings to add to rename_line_set."""
+ for child in children:
+ _, attr = tf_decorator.unwrap(child[1])
+ if not hasattr(attr, '__dict__'):
+ continue
+ api_names = attr.__dict__.get(tensorflow_api_attr, [])
+ deprecated_api_names = attr.__dict__.get('_tf_deprecated_api_names', [])
+ canonical_name = tf_export.get_canonical_name(
+ api_names, deprecated_api_names)
+ for name in deprecated_api_names:
+ rename_line_set.add(' \'tf.%s\': \'tf.%s\'' % (name, canonical_name))
+
+ visitor = public_api.PublicAPIVisitor(visit)
+ visitor.do_not_descend_map['tf'].append('contrib')
+ traverse.traverse(tf, visitor)
+
+ renames_file_text = '%srenames = {\n%s\n}\n' % (
+ _FILE_HEADER, ',\n'.join(sorted(rename_line_set)))
+ file_io.write_string_to_file(output_file_path, renames_file_text)
+
+
+def main(unused_argv):
+ update_renames_v2(_OUTPUT_FILE_PATH)
+
+
+if __name__ == '__main__':
+ tf.app.run(main=main)
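The output of update_renames_v2 is a plain module-level dict. A hypothetical
excerpt of the generated renames_v2.py, built from the
'tf.deprecated_name': 'tf.canonical_name' lines collected above and limited
to renames the tests in this change actually assert:

    renames = {
        'tf.acos': 'tf.math.acos',
        'tf.log': 'tf.math.log',
        'tf.rsqrt': 'tf.math.rsqrt'
    }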
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index fd94d64268..f7fe4119da 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -63,7 +63,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Install the most recent bazel release.
-ENV BAZEL_VERSION 0.14.1
+ENV BAZEL_VERSION 0.15.0
WORKDIR /
RUN mkdir /bazel && \
cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 44120bf274..340f96df48 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -40,10 +40,6 @@ RUN mkdir /usr/local/cuda-9.0/lib && \
ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
- cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
-
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
@@ -83,7 +79,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
>>/etc/bazel.bazelrc
# Install the most recent bazel release.
-ENV BAZEL_VERSION 0.14.1
+ENV BAZEL_VERSION 0.15.0
WORKDIR /
RUN mkdir /bazel && \
cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
index 3bedc8cf34..30bc2d2806 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
@@ -4,7 +4,7 @@ LABEL maintainer="Gunhan Gulsoy <gunan@google.com>"
# It is possible to override these for releases.
ARG TF_BRANCH=master
-ARG BAZEL_VERSION=0.5.4
+ARG BAZEL_VERSION=0.15.0
ARG TF_AVAILABLE_CPUS=32
RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index f1d361e07d..156636ab82 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,7 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
NodeDef bias_add_node;
bias_add_node.set_op("BiasAdd");
bias_add_node.set_name(conv_output_name);
- if (!conv_node.attr().count("data_format")) {
+ if (conv_node.attr().count("data_format") > 0) {
CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
}
CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index e661fb1adc..ab39ed8d69 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -78,7 +78,7 @@ COMMON_PIP_DEPS = [
"//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
"//tensorflow/contrib/nn:nn_py",
"//tensorflow/contrib/predictor:predictor_pip",
- "//tensorflow/contrib/proto:proto_pip",
+ "//tensorflow/contrib/proto:proto",
"//tensorflow/contrib/receptive_field:receptive_field_pip",
"//tensorflow/contrib/rpc:rpc_pip",
"//tensorflow/contrib/session_bundle:session_bundle_pip",
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index c630ca04b8..1f4c3d47bf 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n')
# This version string is semver compatible, but incompatible with pip.
# For pip, we will remove all '-' characters from this string, and use the
# result for pip.
-_VERSION = '1.9.0-rc0'
+_VERSION = '1.9.0'
REQUIRED_PACKAGES = [
'absl-py >= 0.1.6',
@@ -55,7 +55,7 @@ REQUIRED_PACKAGES = [
'six >= 1.10.0',
'protobuf >= 3.6.0',
'setuptools <= 39.1.0',
- 'tensorboard >= 1.8.0, < 1.9.0',
+ 'tensorboard >= 1.10.0, < 1.11.0',
'termcolor >= 1.1.0',
]
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2c8658fc59..45b1abeb10 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -144,13 +144,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "ortools_archive",
urls = [
- "https://mirror.bazel.build/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
- # Please uncomment me, when the next upgrade happens. Then
- # remove the whitelist entry in third_party/repo.bzl.
- # "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+ "https://mirror.bazel.build/github.com/google/or-tools/archive/v6.7.2.tar.gz",
+ "https://github.com/google/or-tools/archive/v6.7.2.tar.gz",
],
- sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
- strip_prefix = "or-tools-253f7955c6a1fd805408fba2e42ac6d45b312d15/src",
+ sha256 = "d025a95f78b5fc5eaa4da5f395f23d11c23cf7dbd5069f1f627f002de87b86b9",
+ strip_prefix = "or-tools-6.7.2/src",
build_file = clean_dep("//third_party:ortools.BUILD"),
)
@@ -487,11 +485,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
tf_http_archive(
name = "llvm",
urls = [
- "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/62b518b75a780a3bc75982cbe54b0e7bc262aa6e.tar.gz",
- "https://github.com/llvm-mirror/llvm/archive/62b518b75a780a3bc75982cbe54b0e7bc262aa6e.tar.gz",
+ "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/7b3bfc8151f3a6bcd9642c49c1f86f66cc43a428.tar.gz",
+ "https://github.com/llvm-mirror/llvm/archive/7b3bfc8151f3a6bcd9642c49c1f86f66cc43a428.tar.gz",
],
- sha256 = "51ab0edcf7dde0207f5cf141aec16b14fcac5290112cdf1ea671a2757f719f8b",
- strip_prefix = "llvm-62b518b75a780a3bc75982cbe54b0e7bc262aa6e",
+ sha256 = "c6cbb21acd46e3e00faa8c379595ecffb99ef77622da17f29371db2bfad1d3d3",
+ strip_prefix = "llvm-7b3bfc8151f3a6bcd9642c49c1f86f66cc43a428",
build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
)