97 files changed, 2122 insertions, 857 deletions
diff --git a/configure b/configure
index a365fe14ed..3a26f73dfa 100755
--- a/configure
+++ b/configure
@@ -466,7 +466,7 @@ done
 while true; do
   # Configure the Cuda SDK version to use.
   if [ -z "$TF_CUDA_VERSION" ]; then
-    read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to use system default]: " TF_CUDA_VERSION
+    read -p "Please specify the CUDA SDK version you want to use, e.g. 7.0. [Leave empty to default to CUDA 8.0]: " TF_CUDA_VERSION
   fi
 
   fromuser=""
@@ -509,7 +509,6 @@ while true; do
     export CUDA_TOOLKIT_PATH
     write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "$CUDA_TOOLKIT_PATH"
     export TF_CUDA_VERSION
-    write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION"
     break
   fi
   echo "Invalid path to CUDA $TF_CUDA_VERSION toolkit. ${CUDA_TOOLKIT_PATH}/${CUDA_RT_LIB_PATH} cannot be found"
@@ -522,6 +521,13 @@ while true; do
   CUDA_TOOLKIT_PATH=""
 done
 
+# Set default CUDA version if not set
+if [ -z "$TF_CUDA_VERSION" ]; then
+  TF_CUDA_VERSION="8.0"
+  export TF_CUDA_VERSION 
+fi
+write_action_env_to_bazelrc "TF_CUDA_VERSION" "$TF_CUDA_VERSION" 
+
 # Set up which gcc nvcc should use as the host compiler
 # No need to set this on Windows
 while [[ "$TF_CUDA_CLANG" != "1" ]] && ! is_windows && true; do
@@ -555,7 +561,7 @@ done
 while true; do
   # Configure the cuDNN version to use.
   if [ -z "$TF_CUDNN_VERSION" ]; then
-    read -p "Please specify the cuDNN version you want to use. [Leave empty to use system default]: " TF_CUDNN_VERSION
+    read -p "Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 6.0]: " TF_CUDNN_VERSION
   fi
 
   fromuser=""
@@ -605,7 +611,6 @@ while true; do
     CUDNN_PATH_FROM_LDCONFIG="$($LDCONFIG_BIN -p | sed -n 's/.*libcudnn.so .* => \(.*\)/\1/p')"
     if [ -e "${CUDNN_PATH_FROM_LDCONFIG}${TF_CUDNN_EXT}" ]; then
       export TF_CUDNN_VERSION
-      write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
       export CUDNN_INSTALL_PATH="$(dirname ${CUDNN_PATH_FROM_LDCONFIG})"
       write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "$CUDNN_INSTALL_PATH"
       break
@@ -626,6 +631,13 @@ while true; do
   CUDNN_INSTALL_PATH=""
 done
 
+# Set default CUDNN version if not set
+if [ -z "$TF_CUDNN_VERSION" ]; then
+  TF_CUDNN_VERSION="6"
+  export TF_CUDNN_VERSION
+fi
+write_action_env_to_bazelrc "TF_CUDNN_VERSION" "$TF_CUDNN_VERSION"
+
 # Configure the compute capabilities that TensorFlow builds for.
 # Since Cuda toolkit is not backward-compatible, this is not guaranteed to work.
 while true; do
@@ -844,6 +856,4 @@ if [ "$TF_NEED_MPI" == "1" ]; then
 fi
 
 
-# TODO(gunan): Remove once bazel correctly handles changes in remote repositories.
-bazel clean
 echo "Configuration finished"
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 277e3c9906..5f857191da 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -84,7 +84,7 @@ cc_library(
         "//tensorflow/compiler/jit/kernels:xla_device_launch_op",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
-        "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/compiler/xla/service:cpu_plugin",  # buildcleaner: keep
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
     ],
@@ -101,7 +101,7 @@ cc_library(
         "//tensorflow/compiler/jit/kernels:xla_device_launch_op",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
-        "//tensorflow/compiler/xla/service:gpu_plugin",
+        "//tensorflow/compiler/xla/service:gpu_plugin",  # buildcleaner: keep
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 8fa7d044d1..3f93a7a511 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -679,7 +679,6 @@ cc_test(
     srcs = ["buffer_liveness_test.cc"],
     deps = [
         ":buffer_liveness",
-        ":cpu_plugin",
         ":hlo",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
@@ -725,7 +724,6 @@ cc_test(
         ":call_graph",
         ":computation_tracker",
         ":copy_insertion",
-        ":cpu_plugin",
         ":flatten_call_graph",
         ":hlo",
         ":hlo_ordering",
@@ -790,7 +788,6 @@ cc_test(
     name = "hlo_ordering_test",
     srcs = ["hlo_ordering_test.cc"],
     deps = [
-        ":cpu_plugin",
         ":hlo",
         ":hlo_ordering",
         "//tensorflow/compiler/xla:shape_util",
@@ -860,7 +857,6 @@ cc_test(
     srcs = ["algebraic_simplifier_test.cc"],
     deps = [
         ":algebraic_simplifier",
-        ":cpu_plugin",
         ":hlo",
         ":hlo_matchers",
         ":hlo_pass",
@@ -928,7 +924,6 @@ cc_test(
     name = "inliner_test",
     srcs = ["inliner_test.cc"],
     deps = [
-        ":cpu_plugin",
         ":hlo",
         ":hlo_matchers",
         ":inliner",
@@ -1085,7 +1080,6 @@ cc_test(
     name = "hlo_computation_test",
     srcs = ["hlo_computation_test.cc"],
     deps = [
-        ":cpu_plugin",
         ":hlo",
         ":hlo_matchers",
         "//tensorflow/compiler/xla:literal_util",
@@ -1116,7 +1110,6 @@ cc_test(
     name = "hlo_module_test",
     srcs = ["hlo_module_test.cc"],
     deps = [
-        ":cpu_plugin",
         ":hlo",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -1256,7 +1249,6 @@ cc_test(
     srcs = ["copy_insertion_test.cc"],
     deps = [
         ":copy_insertion",
-        ":cpu_plugin",
         ":hlo",
         ":hlo_matchers",
         ":tuple_points_to_analysis",
@@ -1321,7 +1313,6 @@ cc_test(
     name = "hlo_rematerialization_test",
     srcs = ["hlo_rematerialization_test.cc"],
     deps = [
-        ":cpu_plugin",
         ":hlo",
         ":hlo_matchers",
         ":hlo_ordering",
@@ -1338,7 +1329,6 @@ cc_test(
     name = "hlo_dce_test",
     srcs = ["hlo_dce_test.cc"],
     deps = [
-        ":cpu_plugin",
         ":hlo",
         ":hlo_dce",
         "//tensorflow/compiler/xla:literal_util",
@@ -1361,7 +1351,6 @@ cc_test(
     deps = [
         ":algebraic_simplifier",
         ":computation_layout",
-        ":cpu_plugin",
         ":hlo",
         ":hlo_matchers",
         ":layout_assignment",
@@ -1434,7 +1423,6 @@ cc_test(
     name = "hlo_cse_test",
     srcs = ["hlo_cse_test.cc"],
     deps = [
-        ":cpu_plugin",
         ":hlo",
         ":hlo_cse",
         ":hlo_matchers",
@@ -1470,7 +1458,6 @@ cc_test(
     name = "hlo_constant_folding_test",
     srcs = ["hlo_constant_folding_test.cc"],
     deps = [
-        ":cpu_plugin",
         ":hlo",
         ":hlo_constant_folding",
         ":hlo_matchers",
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 0840fdf930..afe1a54d3e 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -1103,7 +1103,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (int64 dim = 0; dim < slice_sizes.size(); ++dim) {
     const int64 input_dim_size = operand_shape.dimensions(dim);
     const int64 slice_dim_size = slice_sizes[dim];
-    if (slice_dim_size <= 0) {
+    if (slice_dim_size < 0) {
       return InvalidArgument("negative size index to dynamic slice: %lld",
                              slice_dim_size);
     }
@@ -1175,9 +1175,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (int64 dim = 0; dim < ShapeUtil::Rank(operand_shape); ++dim) {
     const int64 input_dim_size = operand_shape.dimensions(dim);
     const int64 update_dim_size = update_shape.dimensions(dim);
-    if (update_dim_size <= 0) {
+    if (update_dim_size < 0) {
       return InvalidArgument(
-          "size index %lld to dynamic update slice must be > 0",
+          "size index %lld to dynamic update slice must be >= 0",
           update_dim_size);
     }
     if (update_dim_size > input_dim_size) {
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index cdb4498f4e..ea7908f55c 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -58,6 +58,8 @@ class DynamicSliceTest : public ClientLibraryTestBase {
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
     RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {6}, {4},
                   {6.0, 7.0, 0.0, 1.0});
+    // Zero element slice.
+    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}, {2}, {0}, {});
   }
 
   template <typename IndexT>
@@ -75,6 +77,12 @@ class DynamicSliceTest : public ClientLibraryTestBase {
     RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
                   {1, 1}, {3, 3},
                   {{5.0f, 6.0f, 4.0f}, {8.0f, 9.0f, 7.0f}, {2.0f, 3.0f, 1.0f}});
+    // Zero element slice: 2x0.
+    RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
+                  {0, 0}, {2, 0}, {{}, {}});
+    // Zero element slice: 0x2.
+    RunR2<IndexT>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
+                  {0, 0}, {0, 2}, Array2D<float>(0, 2));
   }
 
   template <typename IndexT>
@@ -200,6 +208,10 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0},
                   {8.0, 9.0, 10.0}, {6},
                   {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 8.0, 9.0});
+    // Zero-sized update.
+    RunR1<IndexT>({0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0},
+                  {}, {2},
+                  {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0});
     // clang-format on
   }
 
@@ -226,6 +238,11 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
         {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
         {{10.0f, 11.0f}}, {2, 2},
         {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 10.0f}});
+    // Zero-sized update.
+    RunR2<IndexT>(
+        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}},
+        {{}}, {2, 1},
+        {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}, {7.0f, 8.0f, 9.0f}});
     // clang-format on
   }
 
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 2c770e75dd..9ffe08eded 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -113,6 +113,7 @@ include(zlib)
 include(gif)
 include(png)
 include(jpeg)
+include(lmdb)
 include(eigen)
 include(gemmlowp)
 include(jsoncpp)
@@ -129,6 +130,7 @@ set(tensorflow_EXTERNAL_LIBRARIES
     ${gif_STATIC_LIBRARIES}
     ${png_STATIC_LIBRARIES}
     ${jpeg_STATIC_LIBRARIES}
+    ${lmdb_STATIC_LIBRARIES}
     ${jsoncpp_STATIC_LIBRARIES}
     ${farmhash_STATIC_LIBRARIES}
     ${fft2d_STATIC_LIBRARIES}
@@ -140,6 +142,7 @@ set(tensorflow_EXTERNAL_DEPENDENCIES
     gif_copy_headers_to_destination
     png_copy_headers_to_destination
     jpeg_copy_headers_to_destination
+    lmdb_copy_headers_to_destination
     jsoncpp
     farmhash_copy_headers_to_destination
     highwayhash_copy_headers_to_destination
@@ -158,6 +161,7 @@ include_directories(
     ${gif_INCLUDE_DIR}
     ${png_INCLUDE_DIR}
     ${jpeg_INCLUDE_DIR}
+    ${lmdb_INCLUDE_DIR}
     ${eigen_INCLUDE_DIRS}
     ${gemmlowp_INCLUDE_DIR}
     ${jsoncpp_INCLUDE_DIR}
diff --git a/tensorflow/contrib/cmake/external/lmdb.cmake b/tensorflow/contrib/cmake/external/lmdb.cmake
new file mode 100644
index 0000000000..28ec833bab
--- /dev/null
+++ b/tensorflow/contrib/cmake/external/lmdb.cmake
@@ -0,0 +1,60 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+include (ExternalProject)
+
+set(lmdb_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/lmdb)
+set(lmdb_URL http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz)
+set(lmdb_HASH SHA256=108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326)
+set(lmdb_BUILD ${CMAKE_BINARY_DIR}/lmdb/src/lmdb)
+set(lmdb_INSTALL ${CMAKE_BINARY_DIR}/lmdb/install)
+
+ExternalProject_Add(lmdb
+    PREFIX lmdb
+    URL ${lmdb_URL}
+    URL_HASH ${lmdb_HASH}
+    PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        ${CMAKE_CURRENT_SOURCE_DIR}/patches/lmdb/CMakeLists.txt ${lmdb_BUILD}
+    INSTALL_DIR ${lmdb_INSTALL}
+    DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+    CMAKE_CACHE_ARGS
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+        -DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
+    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+)
+
+if(WIN32)
+    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/lmdb.lib)
+else()
+    set(lmdb_STATIC_LIBRARIES ${lmdb_INSTALL}/lib/liblmdb.a)
+endif()
+
+set(lmdb_HEADERS
+    "${lmdb_INSTALL}/include/lmdb.h"
+    "${lmdb_INSTALL}/include/midl.h"
+)
+
+## put lmdb includes in the directory where they are expected
+add_custom_target(lmdb_create_destination_dir
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${lmdb_INCLUDE_DIR}
+    DEPENDS lmdb)
+
+add_custom_target(lmdb_copy_headers_to_destination
+    DEPENDS lmdb_create_destination_dir)
+
+foreach(header_file ${lmdb_HEADERS})
+  add_custom_command(TARGET lmdb_copy_headers_to_destination PRE_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${lmdb_INCLUDE_DIR}/)
+endforeach()
diff --git a/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt b/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt
new file mode 100644
index 0000000000..19fa607a10
--- /dev/null
+++ b/tensorflow/contrib/cmake/patches/lmdb/CMakeLists.txt
@@ -0,0 +1,26 @@
+cmake_minimum_required(VERSION 2.8.3)
+
+project(liblmdb)
+
+set(LIBLMDB_SRCS
+    "libraries/liblmdb/mdb.c"
+    "libraries/liblmdb/midl.c"
+)
+
+set(LIBLMDB_INCLUDES
+    "libraries/liblmdb/lmdb.h"
+    "libraries/liblmdb/midl.h"
+)
+
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
+
+add_library(lmdb ${LIBLMDB_SRCS})
+
+install(TARGETS lmdb
+  RUNTIME DESTINATION bin COMPONENT RuntimeLibraries
+  LIBRARY DESTINATION lib COMPONENT RuntimeLibraries
+  ARCHIVE DESTINATION lib COMPONENT Development)
+
+foreach(LIBLMDB_INCLUDE ${LIBLMDB_INCLUDES})
+  install(FILES ${LIBLMDB_INCLUDE} DESTINATION include COMPONENT Development)
+endforeach()
diff --git a/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py b/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py
index ac0f0fd422..36b20451ff 100644
--- a/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py
+++ b/tensorflow/contrib/keras/python/keras/datasets/boston_housing.py
@@ -37,8 +37,11 @@ def load_data(path='boston_housing.npz', seed=113, test_split=0.2):
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
   """
   assert 0 <= test_split < 1
+  fh = 'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5'
   path = get_file(
-      path, origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz')
+      path,
+      origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz',
+      file_hash=fh)
   f = np.load(path)
   x = f['x']
   y = f['y']
diff --git a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
index 590cdc6504..7e567d3fb0 100644
--- a/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/convolutional_test.py
@@ -114,16 +114,19 @@ class Conv2DTest(test.TestCase):
           continue
 
         with self.test_session(use_gpu=True):
-          testing_utils.layer_test(
-              keras.layers.Conv2D,
-              kwargs={
-                  'filters': filters,
-                  'kernel_size': kernel_size,
-                  'padding': padding,
-                  'strides': strides,
-                  'data_format': 'channels_first'
-              },
-              input_shape=(num_samples, stack_size, num_row, num_col))
+          # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+          # TODO(b/62340061): Support channels_first on CPU.
+          if test.is_gpu_available(cuda_only=True):
+            testing_utils.layer_test(
+                keras.layers.Conv2D,
+                kwargs={
+                    'filters': filters,
+                    'kernel_size': kernel_size,
+                    'padding': padding,
+                    'strides': strides,
+                    'data_format': 'channels_first'
+                },
+                input_shape=(num_samples, stack_size, num_row, num_col))
 
   def test_convolution_2d_regularization(self):
     # regularizers
diff --git a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
index bbf695a1ba..d8a6a1673b 100644
--- a/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
+++ b/tensorflow/contrib/keras/python/keras/layers/pooling_test.py
@@ -100,15 +100,18 @@ class Pooling2DTest(test.TestCase):
                   'padding': 'valid',
                   'pool_size': (3, 3)},
           input_shape=(3, 5, 6, 4))
-      testing_utils.layer_test(
-          keras.layers.AveragePooling2D,
-          kwargs={
-              'strides': (1, 1),
-              'padding': 'valid',
-              'pool_size': (2, 2),
-              'data_format': 'channels_first'
-          },
-          input_shape=(3, 4, 5, 6))
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      if test.is_gpu_available(cuda_only=True):
+        testing_utils.layer_test(
+            keras.layers.AveragePooling2D,
+            kwargs={
+                'strides': (1, 1),
+                'padding': 'valid',
+                'pool_size': (2, 2),
+                'data_format': 'channels_first'
+            },
+            input_shape=(3, 4, 5, 6))
 
 
 class Pooling3DTest(test.TestCase):
diff --git a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
index 2e9544b0b5..e97e8d0163 100644
--- a/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
+++ b/tensorflow/contrib/mpi/mpi_rendezvous_mgr.cc
@@ -257,7 +257,7 @@ void MPIRendezvousMgr::AddRequest(RecvTensorRequest request,
     this->QueueSendRequest(req);
 
     // Wait for the notification that indicates the tensor has been
-    // succesfully transmitted to the remote process. Only needed if we
+    // successfully transmitted to the remote process. Only needed if we
     // have not parsed the tensor to proto
     if (doOptimalTransfer) mpi_send_call->n_.WaitForNotification();
   };  // done_cb
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1cc712c2e1..b76226f2c9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1894,7 +1894,6 @@ cc_library(
         ":framework",
         ":lib",
         ":lib_internal",
-        ":test",
     ],
 )
 
@@ -1909,7 +1908,7 @@ cc_library(
     deps = [
         ":lib",
         ":lib_internal",
-        ":test",
+        ":test",  # buildcleaner: keep
         "//tensorflow/core/platform/default/build_config:test_main",
     ],
     alwayslink = 1,
@@ -2887,6 +2886,20 @@ filegroup(
 )
 
 filegroup(
+    name = "lmdb_testdata",
+    testonly = 1,
+    srcs = [
+        # A simple key-value store:
+        #   0 : 'a'
+        #   1 : 'b'
+        #    ...
+        #   9 : 'j'
+        "lib/lmdb/testdata/data.mdb",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+filegroup(
     name = "example_parser_configuration_testdata",
     srcs = [
         "example/testdata/parse_example_graph_def.pbtxt",
diff --git a/tensorflow/core/common_runtime/gpu/process_state.cc b/tensorflow/core/common_runtime/gpu/process_state.cc
index 0e21e37fd3..7a1c10d900 100644
--- a/tensorflow/core/common_runtime/gpu/process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/process_state.cc
@@ -127,7 +127,7 @@ Allocator* ProcessState::GetGPUAllocator(const GPUOptions& options, int gpu_id,
         gpu_platform->ExecutorForDevice(gpu_id).ValueOrDie();
     int bus_id = se->GetDeviceDescription().numa_node();
     if (bus_id >= 0 && bus_id < static_cast<int64>(gpu_visitors_.size())) {
-      for (auto v : gpu_visitors_[bus_id]) {
+      for (const auto& v : gpu_visitors_[bus_id]) {
         gpu_allocators_[gpu_id]->AddAllocVisitor(v);
       }
     }
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 7b5cc1c5cb..4a5b88d5fd 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -39,7 +39,8 @@ namespace tensorflow {
 namespace test {
 
 Benchmark::Benchmark(const string& device, Graph* g,
-                     const SessionOptions* options, Graph* init) {
+                     const SessionOptions* options, Graph* init,
+                     Rendezvous* rendez) {
   SessionOptions default_options;
   if (!options) {
     options = &default_options;
@@ -61,7 +62,11 @@ Benchmark::Benchmark(const string& device, Graph* g,
     pool_->Schedule(closure);
   };
 
-  rendez_ = NewLocalRendezvous();
+  if (rendez == nullptr) {
+    rendez_ = NewLocalRendezvous();
+  } else {
+    rendez_ = rendez;
+  }
 
   const int graph_def_version = g->versions().producer();
 
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
index 278a6b3f9f..3a7b3a5ace 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -35,10 +35,11 @@ namespace test {
 
 class Benchmark {
  public:
-  // "device" must be either "cpu" or "gpu".  Takes ownership of "g"
-  // and "init".
+  // "device" must be either "cpu" or "gpu".  Takes ownership of "g",
+  // "init", and one reference on "rendez" (if not null).
   Benchmark(const string& device, Graph* g,
-            const SessionOptions* options = nullptr, Graph* init = nullptr);
+            const SessionOptions* options = nullptr, Graph* init = nullptr,
+            Rendezvous* rendez = nullptr);
   ~Benchmark();
 
   // Executes the graph for "iters" times.
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index 466b779e9b..b705bd74c2 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -557,7 +557,7 @@ TEST(ShapeRefinerTest, ConstantValueAsShape_PackInt32) {
                    .Finalize(root.graph(), &result));
 
   ShapeRefiner m(TF_GRAPH_DEF_VERSION, OpRegistry::Global());
-  for (auto input : inputs) {
+  for (const auto& input : inputs) {
     TF_ASSERT_OK(m.AddNode(input.node()));
   }
   TF_ASSERT_OK(m.AddNode(pack.node()));
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 25847a20a4..54366ce249 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -119,6 +119,18 @@ Status PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
 
 }  // namespace
 
+// static
+const char* const DebugIO::kMetadataFilePrefix = "_tfdbg_";
+
+// static
+const char* const DebugIO::kCoreMetadataTag = "core_metadata_";
+
+// static
+const char* const DebugIO::kDeviceTag = "device_";
+
+// static
+const char* const DebugIO::kGraphTag = "graph_";
+
 DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
                            const int32 output_slot, const string& debug_op)
     : device_name(device_name),
@@ -126,7 +138,8 @@ DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
       output_slot(output_slot),
       debug_op(debug_op),
       debug_node_name(
-          strings::StrCat(node_name, ":", output_slot, ":", debug_op)) {}
+          strings::StrCat(node_name, ":", output_slot, ":", debug_op)),
+      device_path(DeviceNameToDevicePath(device_name)) {}
 
 Status ReadEventFromFile(const string& dump_file_path, Event* event) {
   Env* env(Env::Default());
@@ -158,6 +171,15 @@ Status ReadEventFromFile(const string& dump_file_path, Event* event) {
 }
 
 // static
+const string DebugNodeKey::DeviceNameToDevicePath(const string& device_name) {
+  return strings::StrCat(
+      DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag,
+      str_util::StringReplace(
+          str_util::StringReplace(device_name, ":", "_", true), "/", ",",
+          true));
+}
+
+// static
 const char* const DebugIO::kFileURLScheme = "file://";
 // static
 const char* const DebugIO::kGrpcURLScheme = "grpc://";
@@ -236,7 +258,8 @@ Status DebugIO::PublishDebugMetadata(
       const string core_metadata_path = AppendTimestampToFilePath(
           io::JoinPath(
               dump_root_dir,
-              strings::StrCat("_tfdbg_core_metadata_", "sessionrun",
+              strings::StrCat(DebugIO::kMetadataFilePrefix,
+                              DebugIO::kCoreMetadataTag, "sessionrun",
                               strings::Printf("%.14lld", session_run_index))),
           Env::Default()->NowMicros());
       status.Update(DebugFileIO::DumpEventProtoToFile(
@@ -325,10 +348,11 @@ Status DebugIO::PublishGraph(const Graph& graph, const string& device_name,
   Status status = Status::OK();
   for (const string& debug_url : debug_urls) {
     if (debug_url.find(kFileURLScheme) == 0) {
-      const string dump_root_dir = debug_url.substr(strlen(kFileURLScheme));
-      // TODO(cais): (b/38325442) Serialize the GraphDef to a directory that
-      // reflects the device name.
-      const string file_name = strings::StrCat("_tfdbg_graph_", now_micros);
+      const string dump_root_dir =
+          io::JoinPath(debug_url.substr(strlen(kFileURLScheme)),
+                       DebugNodeKey::DeviceNameToDevicePath(device_name));
+      const string file_name = strings::StrCat(DebugIO::kMetadataFilePrefix,
+                                               DebugIO::kGraphTag, now_micros);
 
       status.Update(
           DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name));
@@ -437,7 +461,7 @@ string DebugFileIO::GetDumpFilePath(const string& dump_root_dir,
                                     const DebugNodeKey& debug_node_key,
                                     const uint64 wall_time_us) {
   return AppendTimestampToFilePath(
-      io::JoinPath(dump_root_dir,
+      io::JoinPath(dump_root_dir, debug_node_key.device_path,
                    strings::StrCat(debug_node_key.node_name, "_",
                                    debug_node_key.output_slot, "_",
                                    debug_node_key.debug_op)),
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index f3e76cc0ee..69d8c7bd4e 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -44,11 +44,14 @@ struct DebugNodeKey {
   DebugNodeKey(const string& device_name, const string& node_name,
                const int32 output_slot, const string& debug_op);
 
+  static const string DeviceNameToDevicePath(const string& device_name);
+
   const string device_name;
   const string node_name;
   const int32 output_slot;
   const string debug_op;
   const string debug_node_name;
+  const string device_path;
 };
 
 class DebugIO {
@@ -136,6 +139,11 @@ class DebugIO {
 
   static Status CloseDebugURL(const string& debug_url);
 
+  static const char* const kMetadataFilePrefix;
+  static const char* const kCoreMetadataTag;
+  static const char* const kDeviceTag;
+  static const char* const kGraphTag;
+
   static const char* const kFileURLScheme;
   static const char* const kGrpcURLScheme;
 };
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 406bcae07f..77039aa4ab 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/util/event.pb.h"
@@ -47,6 +48,18 @@ class DebugIOUtilsTest : public ::testing::Test {
   std::unique_ptr<Tensor> tensor_b_;
 };
 
+TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
+  DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/gpu:2",
+                              "hidden_1/MatMul", 0, "DebugIdentity");
+  EXPECT_EQ("/job:worker/replica:1/task:0/gpu:2", debug_node_key.device_name);
+  EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
+  EXPECT_EQ(0, debug_node_key.output_slot);
+  EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
+  EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name);
+  EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,gpu_2",
+            debug_node_key.device_path);
+}
+
 TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) {
   Initialize();
 
@@ -138,10 +151,14 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) {
 
   // First, create the file at the path.
   const string test_dir = testing::TmpDir();
-  const string txt_file_name = strings::StrCat(test_dir, "/baz");
-
-  if (!env_->FileExists(test_dir).ok()) {
-    ASSERT_TRUE(env_->CreateDir(test_dir).ok());
+  const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+  const DebugNodeKey kDebugNodeKey(kDeviceName, "baz/tensor_a", 0,
+                                   "DebugIdentity");
+  const string txt_file_dir =
+      io::JoinPath(test_dir, DebugNodeKey::DeviceNameToDevicePath(kDeviceName));
+  const string txt_file_name = io::JoinPath(txt_file_dir, "baz");
+  if (!env_->FileExists(txt_file_dir).ok()) {
+    ASSERT_TRUE(env_->RecursivelyCreateDir(txt_file_dir).ok());
   }
   ASSERT_EQ(error::Code::NOT_FOUND, env_->FileExists(txt_file_name).code());
 
@@ -157,8 +174,7 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) {
 
   // Second, try to dump the tensor to a path that requires "baz" to be a
   // directory, which should lead to an error.
-  const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
-                                   "baz/tensor_a", 0, "DebugIdentity");
+
   const uint64 wall_time = env_->NowMicros();
 
   string dump_file_name;
diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc
index 6c68729410..9584d8b9f3 100644
--- a/tensorflow/core/debug/grpc_session_debug_test.cc
+++ b/tensorflow/core/debug/grpc_session_debug_test.cc
@@ -187,7 +187,10 @@ TEST_F(GrpcSessionDebugTest, FileDebugURL) {
     IsSingleFloatValue(outputs[0], 4.0);
 
     std::vector<Tensor> dumped_tensors;
-    LoadTensorDumps("n", &dumped_tensors);
+    LoadTensorDumps(io::JoinPath(DebugNodeKey::DeviceNameToDevicePath(
+                                     cluster->devices()[0].name()),
+                                 "n"),
+                    &dumped_tensors);
 
     if (i == 0 || i == 5) {
       ASSERT_EQ(0, dumped_tensors.size());
@@ -267,7 +270,10 @@ TEST_F(GrpcSessionDebugTest, MultiDevices_String) {
         TF_CHECK_OK(session->Close());
 
         std::vector<Tensor> dumped_tensors;
-        LoadTensorDumps("n", &dumped_tensors);
+        LoadTensorDumps(
+            io::JoinPath(DebugNodeKey::DeviceNameToDevicePath(a_dev.name()),
+                         "n"),
+            &dumped_tensors);
         ASSERT_EQ(1, dumped_tensors.size());
         ASSERT_EQ(TensorShape({2, 2}), dumped_tensors[0].shape());
         for (size_t i = 0; i < 4; ++i) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 2b1a47a93f..1e8c30bad5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -173,7 +173,7 @@ class GrpcRemoteWorker : public WorkerInterface {
     }
 
     IssueRequest(req_copy ? req_copy : request, response, recvtensor_,
-                 std::move(*cb_to_use), call_opts);
+                 *cb_to_use, call_opts);
   }
 
   void LoggingAsync(const LoggingRequest* request, LoggingResponse* response,
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 32ea0cfaa4..16e450abb0 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -258,7 +258,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
           }
         }
         if (request->is_last_partial_run()) {
-          partial_run_mgr_.PartialRunDone(step_id, std::move(finish), s);
+          partial_run_mgr_.PartialRunDone(step_id, finish, s);
         } else {
           finish(s);
         }
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index a4597080f0..1f9e98551f 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -885,14 +885,6 @@ Status InferenceContext::AttachContext(const Status& status) {
                 strings::StrCat(status.error_message(), error_context));
 }
 
-ShapeHandle InferenceContext::input_handle_shape(int idx) {
-  if (input_handle_shapes_and_types_[idx] == nullptr) {
-    input_handle_shapes_and_types_[idx].reset(
-        new std::vector<ShapeAndType>{{UnknownShape(), DT_INVALID}});
-  }
-  return (*input_handle_shapes_and_types_[idx])[0].shape;
-}
-
 bool InferenceContext::MergeHandleShapesAndTypes(
     const std::vector<ShapeAndType>& shapes_and_types,
     std::vector<ShapeAndType>* to_update) {
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index baeab93e30..119bed4071 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -491,37 +491,12 @@ class InferenceContext {
     return input_handle_shapes_and_types_[idx].get();
   }
 
-  // DEPRECATED: use input_handle_shapes_and_types.
-  ShapeHandle input_handle_shape(int idx);
-  // DEPRECATED: use input_handle_shapes_and_types.
-  DataType input_handle_dtype(int idx) const {
-    if (input_handle_shapes_and_types_[idx] == nullptr) {
-      return DT_INVALID;
-    } else {
-      DCHECK_EQ(input_handle_shapes_and_types_[idx]->size(), 1);
-      return (*input_handle_shapes_and_types_[idx])[0].dtype;
-    }
-  }
-
   void set_output_handle_shapes_and_types(
       int idx, const std::vector<ShapeAndType>& shapes_and_types) {
     output_handle_shapes_and_types_[idx].reset(
         new std::vector<ShapeAndType>(shapes_and_types));
   }
 
-  // DEPRECATED: use output_handle_shapes_and_types.
-  ShapeHandle output_handle_shape(int idx) {
-    return output_handle_shapes_and_types_[idx] == nullptr
-               ? UnknownShape()
-               : (*output_handle_shapes_and_types_[idx])[0].shape;
-  }
-  // DEPRECATED: use output_handle_shapes_and_types.
-  DataType output_handle_dtype(int idx) const {
-    return output_handle_shapes_and_types_[idx] == nullptr
-               ? DT_INVALID
-               : (*output_handle_shapes_and_types_[idx])[0].dtype;
-  }
-
   // Note that shape functions should usually call MakeShapeFromShapeTensor,
   // as it does more analysis to provide partial shapes.
   //
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 28ebf7e8c3..8d433fc8c6 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -435,7 +435,7 @@ Status GraphConstructor::MakeNode(const NodeDef& node_def, Node** node) {
 Status GraphConstructor::ValidateShape(Node* node) {
   if (!opts_.importing) return Status::OK();
   TF_RETURN_IF_ERROR(refiner_->AddNode(node));
-  // For nodes with the _output_shapes atttribute, override the shape.
+  // For nodes with the _output_shapes attribute, override the shape.
   std::vector<TensorShapeProto> shape_attrs;
   const char* kAttrName = "_output_shapes";
   if (!GetNodeAttr(node->attrs(), kAttrName, &shape_attrs).ok()) {
@@ -481,7 +481,7 @@ Status GraphConstructor::ValidateShape(Node* node) {
           "MutableHashTableOfTensors", "Mutex", "CuckooTable", "IndexTable",
           "WholeFileReader", "TextLineReader", "FixedLengthRecordReader",
           "TFRecordReader", "IdentityReader", "RefSwitch", "RefEnter",
-          "RefNextIteration", "RefMerge", "RefIdentity",
+          "RefNextIteration", "RefMerge", "RefIdentity", "LMDBReader",
           // To be removed after 2017/04/24.
           "ConditionalAccumulator", "SparseConditionalAccumulator", "Table",
       };
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index d40e66cd16..bebd5d7bce 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -185,21 +185,13 @@ cc_test(
     name = "virtual_scheduler_test",
     srcs = ["virtual_scheduler_test.cc"],
     deps = [
-        ":graph_properties",
-        ":utils",
         ":virtual_placer",
         ":virtual_scheduler",
         "//tensorflow/cc:cc_ops",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:utils",
-        "//tensorflow/core/grappler/clusters:utils",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
-        "//tensorflow/core/grappler/costs:cost_estimator",
     ],
 )
 
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 721cf535b9..5c0515803d 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -218,9 +218,13 @@ Status GraphProperties::InferDynamically(Cluster* cluster) {
   TF_RETURN_IF_ERROR(
       cluster->Run(item_.graph, item_.feed, item_.fetch, &metadata));
 
+  return InferFromCostGraph(metadata.cost_graph());
+}
+
+Status GraphProperties::InferFromCostGraph(const CostGraphDef& cost_graph) {
   std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
   std::unordered_map<string, const NodeDef*> name_to_node;  // Empty
-  for (auto& node : metadata.cost_graph().node()) {
+  for (auto& node : cost_graph.node()) {
     name_to_cost[node.name()] = &node;
 
     std::vector<OpInfo::TensorProperties> output_properties;
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index d2b466175b..f2f6ad19a7 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -36,6 +36,7 @@ class GraphProperties {
 
   Status InferStatically();
   Status InferDynamically(Cluster* cluster);
+  Status InferFromCostGraph(const CostGraphDef& cost_graph);
 
   bool HasOutputProperties(const string& name) const;
   std::vector<OpInfo::TensorProperties> GetInputProperties(
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 6eca083184..bff5f7acc5 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -149,6 +149,54 @@ TEST_F(GraphPropertiesTest, DynamicProperties) {
   }
 }
 
+TEST_F(GraphPropertiesTest, Variables) {
+  GrapplerItem item;
+  TF_CHECK_OK(NodeDefBuilder("Var", "Variable")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("shape", TensorShape({3, 7}))
+                  .Finalize(item.graph.add_node()));
+  item.fetch.push_back("Var");
+
+  Tensor initial_val(DT_FLOAT, TensorShape({3, 7}));
+  TF_CHECK_OK(NodeDefBuilder("InitialVal", "Const")
+                  .Attr("dtype", DT_FLOAT)
+                  .Attr("value", initial_val)
+                  .Finalize(item.graph.add_node()));
+  TF_CHECK_OK(NodeDefBuilder("InitVar", "Assign")
+                  .Input("Var", 0, DT_FLOAT_REF)
+                  .Input("InitialVal", 0, DT_FLOAT)
+                  .Finalize(item.graph.add_node()));
+  item.init_ops.push_back("InitVar");
+
+  {
+    GraphProperties static_properties(item);
+    TF_CHECK_OK(static_properties.InferStatically());
+
+    const auto props = static_properties.GetOutputProperties("Var");
+    EXPECT_EQ(1, props.size());
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT_REF, prop.dtype());
+    EXPECT_FALSE(prop.shape().unknown_rank());
+    EXPECT_EQ(2, prop.shape().dim_size());
+    EXPECT_EQ(3, prop.shape().dim(0).size());
+    EXPECT_EQ(7, prop.shape().dim(1).size());
+  }
+  {
+    TF_CHECK_OK(cluster_->Initialize(item));
+    GraphProperties dynamic_properties(item);
+    TF_CHECK_OK(dynamic_properties.InferDynamically(cluster_.get()));
+
+    const auto props = dynamic_properties.GetOutputProperties("Var");
+    EXPECT_EQ(1, props.size());
+    const OpInfo::TensorProperties& prop = props[0];
+    EXPECT_EQ(DT_FLOAT_REF, prop.dtype());
+    EXPECT_FALSE(prop.shape().unknown_rank());
+    EXPECT_EQ(2, prop.shape().dim_size());
+    EXPECT_EQ(3, prop.shape().dim(0).size());
+    EXPECT_EQ(7, prop.shape().dim(1).size());
+  }
+}
+
 TEST_F(GraphPropertiesTest, VarHandles) {
   GrapplerItem item;
   TF_CHECK_OK(NodeDefBuilder("Var", "VarHandleOp")
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index c42218e447..eb1e521fce 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -245,20 +245,20 @@ class NodeProcessor {
   virtual Status AddLayoutTransposeToInputs() {
     std::vector<int> input_pos = GetInputPos();
     for (const auto& pos : input_pos) {
-      string node_name_NHWCToNCHW = strings::StrCat(
-          kTransposeNHWCToNCHW, "-", node_->name(), "-", node_->input(pos));
+      string base_name = strings::StrCat(node_->name(), "-", node_->input(pos));
+      string node_name =
+          AddPrefixToNodeName(base_name, kTransposeNHWCToNCHW, "-");
       auto input_node = node_map_->GetNode(node_->input(pos));
       int output_pos = NodePosition(node_->input(pos));
       TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
       TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
       AddNodeTranspose(
-          node_name_NHWCToNCHW, node_->input(pos), node_->attr().at("T").type(),
+          node_name, node_->input(pos), node_->attr().at("T").type(),
           input_node->attr().at("_output_shapes").list().shape(output_pos),
           true);
-      node_map_->UpdateOutput(node_->input(pos), node_->name(),
-                              node_name_NHWCToNCHW);
-      node_map_->AddOutput(node_name_NHWCToNCHW, node_->name());
-      *node_->mutable_input(pos) = node_name_NHWCToNCHW;
+      node_map_->UpdateOutput(node_->input(pos), node_->name(), node_name);
+      node_map_->AddOutput(node_name, node_->name());
+      *node_->mutable_input(pos) = node_name;
     }
     return Status::OK();
   }
@@ -266,9 +266,10 @@ class NodeProcessor {
   virtual Status AddLayoutTransposeToOutputs() {
     auto outputs = node_map_->GetOutputs(node_->name());
     for (const auto& output : outputs) {
-      string node_name_NCHWToNHWC = strings::StrCat(
-          kTransposeNCHWToNHWC, "-", node_->name(), "-", output->name());
-      // TODO (yaozhang): handle the rare case where node A is connected to more
+      string base_name = strings::StrCat(node_->name(), "-", output->name());
+      string node_name =
+          AddPrefixToNodeName(base_name, kTransposeNCHWToNHWC, "-");
+      // TODO(yaozhang): handle the rare case where node A is connected to more
       // than one input of node B.
       auto it = std::find_if(output->mutable_input()->begin(),
                              output->mutable_input()->end(),
@@ -290,13 +291,12 @@ class NodeProcessor {
       }
       TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
       TF_RETURN_IF_ERROR(HasAttribute(*node_, "_output_shapes"));
-      AddNodeTranspose(
-          node_name_NCHWToNHWC, node_->name(), node_->attr().at("T").type(),
-          node_->attr().at("_output_shapes").list().shape(0), false);
-      *it = node_name_NCHWToNHWC;
-      node_map_->UpdateOutput(node_->name(), output->name(),
-                              node_name_NCHWToNHWC);
-      node_map_->AddOutput(node_name_NCHWToNHWC, output->name());
+      AddNodeTranspose(node_name, node_->name(), node_->attr().at("T").type(),
+                       node_->attr().at("_output_shapes").list().shape(0),
+                       false);
+      *it = node_name;
+      node_map_->UpdateOutput(node_->name(), output->name(), node_name);
+      node_map_->AddOutput(node_name, output->name());
     }
     return Status::OK();
   }
@@ -468,7 +468,13 @@ class Conv2DBackpropInputProcessor : public Conv2DProcessor {
 
   Status CustomizedProcessing() override {
     NodeDef* node = node_map_->GetNode(node_->input(0));
-    return UpdateAttrValue(node);
+    NodeDef* added_node = graph_->add_node();
+    *added_node = *node;
+    string node_name =
+        AddPrefixToNodeName(node->name(), "LayoutOptimizer", "-");
+    added_node->set_name(node_name);
+    node_map_->AddNode(node_name, added_node);
+    return UpdateAttrValue(added_node);
   }
 };
 
@@ -621,9 +627,11 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
 
   Status CustomizedProcessing() override {
     if (is_4d_with_vector_) {
-      string suffix = strings::StrCat("-", node_->name(), "-", node_->input(1));
-      string reshape_node_name = strings::StrCat(kReshapeNHWCToNCHW, suffix);
-      string shape_const_node_name = strings::StrCat(kReshapeConst, suffix);
+      string base_name = strings::StrCat(node_->name(), "-", node_->input(1));
+      string reshape_node_name =
+          AddPrefixToNodeName(base_name, kReshapeNHWCToNCHW, "-");
+      string shape_const_node_name =
+          AddPrefixToNodeName(base_name, kReshapeConst, "-");
       auto input_node = node_map_->GetNode(node_->input(1));
       TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
       int vector_size =
@@ -710,15 +718,15 @@ class SliceProcessor : public AgnosticNodeProcessor {
   Status CustomizedProcessing() override {
     // Skip the first input, which is the data to be sliced.
     for (int i = 1; i < node_->input_size(); i++) {
-      string node_name_NHWCToNCHW =
-          strings::StrCat(kPermVecNHWCToNCHW, "-", node_->name(), "-input", i);
+      string base_name = strings::StrCat(node_->name(), "-input", i);
+      string node_name =
+          AddPrefixToNodeName(base_name, kPermVecNHWCToNCHW, "-");
       TF_RETURN_IF_ERROR(HasAttribute(*node_, "Index"));
-      AddNodePermVec(node_name_NHWCToNCHW, node_->input(i),
+      AddNodePermVec(node_name, node_->input(i),
                      node_->attr().at("Index").type(), true);
-      node_map_->UpdateOutput(node_->input(i), node_->name(),
-                              node_name_NHWCToNCHW);
-      node_map_->AddOutput(node_name_NHWCToNCHW, node_->name());
-      *node_->mutable_input(i) = node_name_NHWCToNCHW;
+      node_map_->UpdateOutput(node_->input(i), node_->name(), node_name);
+      node_map_->AddOutput(node_name, node_->name());
+      *node_->mutable_input(i) = node_name;
     }
     return Status::OK();
   }
@@ -795,7 +803,7 @@ class SliceProcessorConcatOffset : public AgnosticNodeProcessor {
                                       "input 1 of ConcatOffset"));
       }
       // Need to process if the channel is at dimension 3, which indicates the
-      // NHWC format is being used. As mutiple Slice nodes may share the same
+      // NHWC format is being used. As multiple Slice nodes may share the same
       // ConcatOffset node, the NHWC to NCHW conversion may have already
       // been performed when processing other Slice nodes.
       TF_RETURN_IF_ERROR(HasAttribute(*axis_node, "value"));
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index be38ca1a69..566ea1d87d 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -36,8 +36,8 @@ void AddOutputShape(Node* node, const TensorShape& shape) {
 
 class LayoutOptimizerTest : public ::testing::Test {
  protected:
-  Output SimpleConv(tensorflow::Scope* s, int input_size, int filter_size,
-                    const string& padding) {
+  Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
+                      const string& padding) {
     int batch_size = 128;
     int input_height = input_size;
     int input_width = input_size;
@@ -65,11 +65,80 @@ class LayoutOptimizerTest : public ::testing::Test {
     AddOutputShape(conv.node(), input_shape);
     return conv;
   }
+
+  Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
+                                   int filter_size, const string& padding) {
+    int batch_size = 128;
+    int input_height = input_size;
+    int input_width = input_size;
+    int input_depth = 3;
+    int filter_count = 2;
+    int stride = 1;
+    TensorShape input_sizes_shape({4});
+    Tensor input_data(DT_INT32, input_sizes_shape);
+    test::FillValues<int>(&input_data,
+                          {batch_size, input_height, input_width, input_depth});
+    Output input_sizes =
+        ops::Const(s->WithOpName("InputSizes"), Input::Initializer(input_data));
+    AddOutputShape(input_sizes.node(), input_sizes_shape);
+
+    TensorShape filter_shape(
+        {filter_size, filter_size, input_depth, filter_count});
+    Tensor filter_data(DT_FLOAT, filter_shape);
+    test::FillIota<float>(&filter_data, 1.0f);
+    Output filter =
+        ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
+    AddOutputShape(filter.node(), filter_shape);
+
+    int output_height = input_height;
+    int output_width = input_width;
+    TensorShape output_shape(
+        {batch_size, output_height, output_width, filter_count});
+    Tensor output_data(DT_FLOAT, output_shape);
+    test::FillIota<float>(&output_data, 1.0f);
+    Output output =
+        ops::Const(s->WithOpName("Output"), Input::Initializer(output_data));
+    AddOutputShape(output.node(), output_shape);
+
+    Output conv_backprop_input = ops::Conv2DBackpropInput(
+        s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
+        {1, stride, stride, 1}, padding);
+    TensorShape input_shape(
+        {batch_size, input_height, input_width, input_depth});
+    AddOutputShape(conv_backprop_input.node(), input_shape);
+    return conv_backprop_input;
+  }
+
+  Tensor GetAttrValue(const NodeDef& node) {
+    Tensor tensor;
+    CHECK(tensor.FromProto(node.attr().at({"value"}).tensor()));
+    return tensor;
+  }
 };
 
+TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  optimizer.set_num_gpus(1);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  NodeMap node_map(&output);
+  auto input_sizes_node = node_map.GetNode(
+      AddPrefixToNodeName("InputSizes", "LayoutOptimizer", "-"));
+  CHECK(input_sizes_node);
+  auto input_sizes = GetAttrValue(*input_sizes_node);
+  Tensor input_sizes_expected(DT_INT32, {4});
+  test::FillValues<int>(&input_sizes_expected, {128, 3, 7, 7});
+  test::ExpectTensorEqual<int>(input_sizes_expected, input_sizes);
+}
+
 TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv(&s, 2, 1, "SAME");
+  auto conv = SimpleConv2D(&s, 2, 1, "SAME");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -84,7 +153,7 @@ TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
 
 TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv(&s, 2, 1, "SAME");
+  auto conv = SimpleConv2D(&s, 2, 1, "SAME");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -99,7 +168,7 @@ TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
 
 TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv(&s, 2, 2, "VALID");
+  auto conv = SimpleConv2D(&s, 2, 2, "VALID");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -114,7 +183,7 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
 
 TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv(&s, 2, 2, "SAME");
+  auto conv = SimpleConv2D(&s, 2, 2, "SAME");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -129,7 +198,7 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
 
 TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv(&s, 2, 3, "VALID");
+  auto conv = SimpleConv2D(&s, 2, 3, "VALID");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index b7a04f4423..cc3cbdacb4 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -98,13 +98,18 @@ int NodePosition(const string& name) {
   return position;
 }
 
-string AddPrefixToNodeName(const string& name, const string& prefix) {
+string AddPrefixToNodeName(const string& name, const string& prefix,
+                           const string& delimiter) {
   if (!name.empty()) {
     if (name[0] == '^') {
-      return strings::StrCat("^", prefix, "/", name.substr(1));
+      return strings::StrCat("^", prefix, delimiter, name.substr(1));
     }
   }
-  return strings::StrCat(prefix, "/", name);
+  return strings::StrCat(prefix, delimiter, name);
+}
+
+string AddPrefixToNodeName(const string& name, const string& prefix) {
+  return AddPrefixToNodeName(name, prefix, "/");
 }
 
 bool ExecuteWithTimeout(std::function<void()> fn, const int64 timeout_in_ms,
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 5a3c0614e7..f86fc608c5 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -60,7 +60,11 @@ int NodePosition(const string& name);
 // Returns the node name and position in a single call.
 string ParseNodeName(const string& name, int* position);
 
-// Add a prefix to a node name
+// Add a prefix to a node name with a custom delimiter.
+string AddPrefixToNodeName(const string& name, const string& prefix,
+                           const string& delimiter);
+
+// Add a prefix to a node name.
 string AddPrefixToNodeName(const string& name, const string& prefix);
 
 // Executes a 'fn' in the 'thread_pool'. The method waits for the configured
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1e5bc0ceab..ed0379bb3f 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -1904,6 +1904,7 @@ cc_library(
     deps = [
         ":fixed_length_record_reader_op",
         ":identity_reader_op",
+        ":lmdb_reader_op",
         ":matching_files_op",
         ":reader_ops",
         ":restore_op",
@@ -1939,6 +1940,14 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
+    name = "lmdb_reader_op",
+    prefix = "lmdb_reader_op",
+    deps = IO_DEPS + [
+        "@lmdb",
+    ],
+)
+
+tf_kernel_library(
     name = "matching_files_op",
     prefix = "matching_files_op",
     deps = IO_DEPS,
@@ -3170,6 +3179,21 @@ tf_kernel_library(
     deps = REQUIRED_DEPS,
 )
 
+tf_cc_test(
+    name = "sendrecv_ops_test",
+    srcs = ["sendrecv_ops_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),  # Required for benchmarking
+    deps = [
+        ":ops_testutil",
+        ":ops_util",
+        ":sendrecv_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 cc_library(
     name = "sparse",
     deps = [
@@ -4313,6 +4337,7 @@ filegroup(
             # not used on Android. Those ops also do not compile if included,
             # unless we add the additional deps they need.
             "tf_record_reader_op.*",
+            "lmdb_reader_op.*",
             "string_to_hash_bucket_op.*",
             "sdca_ops.*",
             "sdca_internal.*",
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index 73446fbe05..914627992b 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -153,7 +153,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync(
         info_checker_callback) const {
   std::vector<HostLapackInfo> host_lapack_infos;
   if (dev_lapack_infos.empty()) {
-    info_checker_callback(Status::OK(), std::move(host_lapack_infos));
+    info_checker_callback(Status::OK(), host_lapack_infos);
     return Status::OK();
   }
 
@@ -174,7 +174,7 @@ Status CudaSolver::CopyLapackInfoToHostAsync(
   auto wrapped_info_checker_callback =
       [info_checker_callback](std::vector<HostLapackInfo> host_lapack_infos) {
         Status status;
-        for (auto host_lapack_info : host_lapack_infos) {
+        for (const auto& host_lapack_info : host_lapack_infos) {
           for (int i = 0; i < host_lapack_info.size() && status.ok(); ++i) {
             const int info_value = (host_lapack_info.data())[i];
             if (info_value != 0) {
diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc
index 037272e009..1830b2a178 100644
--- a/tensorflow/core/kernels/debug_ops_test.cc
+++ b/tensorflow/core/kernels/debug_ops_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
@@ -94,7 +95,22 @@ TEST_F(DebugIdentityOpTest, Int32Success_6_FileURLs) {
     ASSERT_TRUE(env_->FileExists(dump_roots[i]).ok());
     ASSERT_TRUE(env_->IsDirectory(dump_roots[i]).ok());
 
-    DIR* dir = opendir(dump_roots[i].c_str());
+    std::vector<string> device_roots;
+    DIR* dir0 = opendir(dump_roots[i].c_str());
+    struct dirent* ent0;
+    const string kDeviceDirPrefix =
+        strings::StrCat(DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag);
+    while ((ent0 = readdir(dir0)) != nullptr) {
+      if (!strncmp(ent0->d_name, kDeviceDirPrefix.c_str(),
+                   kDeviceDirPrefix.size())) {
+        device_roots.push_back(io::JoinPath(dump_roots[i], ent0->d_name));
+      }
+    }
+    ASSERT_EQ(1, device_roots.size());
+    closedir(dir0);
+
+    const string& device_root = device_roots[0];
+    DIR* dir = opendir(device_root.c_str());
     struct dirent* ent;
     int dump_files_found = 0;
     while ((ent = readdir(dir)) != nullptr) {
@@ -102,8 +118,7 @@ TEST_F(DebugIdentityOpTest, Int32Success_6_FileURLs) {
         dump_files_found++;
 
         // Try reading the file into a Event proto.
-        const string dump_file_path =
-            strings::StrCat(dump_roots[i], "/", ent->d_name);
+        const string dump_file_path = io::JoinPath(device_root, ent->d_name);
         std::fstream ifs(dump_file_path, std::ios::in | std::ios::binary);
         Event event;
         event.ParseFromIstream(&ifs);
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 7303c97c0e..0d4feb7c1d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -34,6 +34,19 @@ namespace tensorflow {
 
 using Eigen::GpuDevice;
 
+// Returns whether depthwise convolution forward pass can be performed using the
+// faster ('Small') variant of the kernel.
+EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
+    const DepthwiseArgs args) {
+  return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 16 &&
+         args.in_cols <= 16 && args.in_rows == args.out_rows &&
+         args.in_cols == args.out_cols && args.pad_rows >= 0 &&
+         args.pad_rows < args.filter_rows && args.pad_cols >= 0 &&
+         args.pad_cols < args.filter_cols &&
+         args.filter_rows * args.filter_cols <=
+             (args.in_rows + 1) / 2 * args.in_cols;
+}
+
 // A Cuda kernel to compute the depthwise convolution forward pass
 // in NHWC format.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
@@ -118,7 +131,8 @@ __global__ void __launch_bounds__(1024, 2)
 
 // CUDA kernel to compute the depthwise convolution forward pass in NCHW format,
 // tailored for small images up to 16x16. Stride and depth multiplier must be 1.
-// Padding must be 'SAME', which allows to reuse the index computation.
+// Padding must be 'SAME', which allows to reuse the index computation. Only
+// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
 // Tiles of the input and filter tensors are loaded into shared memory before
 // performing the convolution. Each thread handles two elements per iteration,
 // one each in the lower and upper half of a tile.
@@ -126,6 +140,7 @@ template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           bool kKnownEvenRows>
 __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
     const DepthwiseArgs args, const T* input, const T* filter, T* output) {
+  assert(CanLaunchDepthwiseConv2dGPUSmall(args));
   // Holds block plus halo and filter data for blockDim.x depths.
   extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
   T* const shared_data = reinterpret_cast<T*>(shared_memory);
@@ -143,16 +158,15 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
 
   // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
   const int block_slices = 8;
-  const int block_cols = blockDim.y;
   const int block_rows = blockDim.z;
 
   // These values are the same for all threads and could
   // be precomputed on the CPU.
-  const int block_size = block_rows * block_cols * block_slices;
+  const int block_size = block_rows * in_cols * block_slices;
   const int in_row_size = in_cols * in_depth;
   const int in_size = in_rows * in_row_size;
   const int in_increment = (in_cols - 1) * block_slices;
-  const int filter_size = filter_rows * filter_cols;
+  const int filter_pixels = filter_rows * filter_cols;
   const int tile_cols = in_cols + filter_cols - 1;
   const int even_rows = kKnownEvenRows || (1 & ~in_rows);
   const int tile_rows = in_rows + filter_rows - even_rows;
@@ -170,7 +184,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
   const int thread_row = threadIdx.z;
 
   // Position in block.
-  const int thread_pix = thread_row * block_cols + thread_col;
+  const int thread_pix = thread_row * in_cols + thread_col;
   const int thread_idx = thread_pix * block_slices + thread_depth;
 
   // Initialize tile, in particular the padding.
@@ -192,7 +206,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
 
   const int max_depth = in_depth - thread_depth;
   const int filter_write_offset =
-      thread_pix < filter_size ? tile_size + thread_idx : 0;
+      thread_pix < filter_pixels ? tile_size + thread_idx : 0;
   const int filter_read_offset = tile_size + thread_depth;
   const bool skip_second =
       !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
@@ -377,56 +391,198 @@ __global__ void __launch_bounds__(1024, 2)
   }
 }
 
-template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
-bool TryLaunchDepthwiseConv2dGPUSmall(const GpuDevice& d,
-                                      const DepthwiseArgs args, const T* input,
-                                      const T* filter, T* output,
-                                      TensorFormat data_format) {
-  if (data_format != FORMAT_NHWC || args.depth_multiplier != 1 ||
-      args.stride != 1 || args.in_rows > 16 || args.in_cols > 16 ||
-      args.in_rows != args.out_rows || args.in_cols != args.out_cols ||
-      args.pad_rows < 0 || args.pad_rows >= args.filter_rows ||
-      args.pad_cols < 0 || args.pad_cols >= args.filter_cols) {
-    return false;
+// CUDA kernel to compute the depthwise convolution forward pass in NCHW format,
+// tailored for small images up to 16x16. Stride and depth multiplier must be 1.
+// Padding must be 'SAME', which allows to reuse the index computation. Only
+// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
+// Tiles of the input and filter tensors are loaded into shared memory before
+// performing the convolution. Each thread handles two elements per iteration,
+// one each in the lower and upper half of a tile.
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          bool kKnownEvenRows>
+__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
+    const DepthwiseArgs args, const T* input, const T* filter, T* output) {
+  assert(CanLaunchDepthwiseConv2dGPUSmall(args));
+  // Holds block plus halo and filter data for blockDim.z depths.
+  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
+  T* const shared_data = reinterpret_cast<T*>(shared_memory);
+
+  const int batches = args.batch;
+  const int in_rows = args.in_rows;
+  const int in_cols = args.in_cols;
+  const int in_depth = args.in_depth;
+  const int filter_rows =
+      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
+  const int filter_cols =
+      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
+  const int pad_rows = args.pad_rows;
+  const int pad_cols = args.pad_cols;
+
+  // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16.
+  const int block_rows = blockDim.y;
+  const int block_slices = 8;
+
+  // These values are the same for all threads and could
+  // be precomputed on the CPU.
+  const int block_pixels = in_cols * block_rows;
+  const int block_size = block_pixels * block_slices;
+  const int in_pixels = in_cols * in_rows;
+  const int in_increment = in_cols - 1;
+  const int filter_pixels = filter_rows * filter_cols;
+  const int tile_cols = in_cols + filter_cols - 1;
+  const int even_rows = kKnownEvenRows || (1 & ~in_rows);
+  const int tile_rows = in_rows + filter_rows - even_rows;
+  const int tile_pixels = tile_cols * tile_rows;
+  const int tile_size = tile_pixels * block_slices;
+  const int tile_offset = block_rows * tile_cols;
+  const int pad_offset = pad_rows * tile_cols + pad_cols;
+  const int in_slices = in_depth * batches;
+  const int in_blocks = (in_slices + block_slices - 1) / block_slices;
+
+  const int thread_col = threadIdx.x;
+  const int thread_row = threadIdx.y;
+  const int thread_depth = threadIdx.z;
+
+  // Position in block.
+  const int thread_pix = thread_row * in_cols + thread_col;
+  const int thread_idx = thread_depth * block_pixels + thread_pix;
+
+  // Initialize tile, in particular the padding.
+  for (int i = thread_idx; i < tile_size; i += block_size) {
+    shared_data[i] = T(0);
   }
+  __syncthreads();
 
-  const int block_rows = (args.in_rows + 1) / 2;
-  if (args.filter_rows * args.filter_cols > args.in_cols * block_rows) {
-    return false;
+  // Position in tensors.
+  const int tensor_idx = thread_depth * in_pixels + thread_pix;
+
+  // Position in (padded) shared memory.
+  const int data_pix = thread_row * tile_cols + thread_col;
+  const int data_idx = thread_depth * tile_pixels + data_pix;
+
+  // Position in shared memory, offset by pad_rows / pad_cols.
+  const int tile_idx = data_idx + pad_offset;
+
+  // Filter is always in HWCK format, irrespective of the input/output format.
+  const int filter_pix = thread_idx / block_slices;
+  const int filter_depth = thread_idx % block_slices;
+  const int filter_idx = filter_pix * in_depth;
+
+  const int max_slice = in_slices - thread_depth;
+  const int filter_write_offset =
+      filter_pix < filter_pixels ? tile_size + thread_idx : 0;
+  const int filter_read_offset = tile_size + thread_depth;
+  const bool skip_second =
+      !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
+
+  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
+    const int slice = b * block_slices;
+
+    const int inout_offset = slice * in_pixels + tensor_idx;
+    const bool slice_in_range = slice < max_slice;
+
+    if (slice_in_range) {
+      const T* const in_ptr = inout_offset + input;
+      T* const tile_ptr = tile_idx + shared_data;
+      tile_ptr[0] = ldg(in_ptr);
+      if (!skip_second) {
+        tile_ptr[tile_offset] = ldg(block_pixels + in_ptr);
+      }
+    }
+
+    if (filter_write_offset != 0) {
+      const int filter_offset = filter_idx + (slice + filter_depth) % in_depth;
+      shared_data[filter_write_offset] = ldg(filter_offset + filter);
+    }
+
+    // Note: the condition to reach this is uniform across the entire block.
+    __syncthreads();
+
+    if (slice_in_range) {
+      T sum1 = 0;
+      T sum2 = 0;
+      int shared_offset = data_idx;
+      const T* filter_ptr = filter_read_offset + shared_data;
+      UNROLL for (int r = 0; r < filter_rows; ++r) {
+        UNROLL for (int c = 0; c < filter_cols; ++c) {
+          const T filter_value = *filter_ptr;
+          const T* const tile_ptr = shared_offset + shared_data;
+          sum1 += filter_value * tile_ptr[0];
+          sum2 += filter_value * tile_ptr[tile_offset];
+          ++shared_offset;
+          filter_ptr += block_slices;
+        }
+        shared_offset += in_increment;
+      }
+      T* const out_ptr = inout_offset + output;
+      out_ptr[0] = sum1;
+      if (!skip_second) {
+        out_ptr[block_pixels] = sum2;
+      }
+    }
+
+    // Note: the condition to reach this is uniform across the entire block.
+    __syncthreads();
   }
+}
 
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
+          bool kKnownEvenRows>
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
+                                   const T* input, const T* filter, T* output,
+                                   TensorFormat data_format) {
+  const int block_rows = (args.in_rows + 1) / 2;
+  const int block_slices = 8;
   const int tile_cols = args.in_cols + args.filter_cols - 1;
   const int tile_rows = block_rows * 2 + args.filter_rows - 1;
-  const int tile_size = tile_rows * tile_cols;
-  const int filter_size = args.filter_rows * args.filter_cols;
-  dim3 block_dim = dim3(8, args.in_cols, block_rows);
-  const int shared_memory_size =
-      block_dim.x * (tile_size + filter_size) * sizeof(T);
+  const int tile_pixels = tile_rows * tile_cols;
+  const int filter_pixels = args.filter_rows * args.filter_cols;
 
+  const int shared_memory_size =
+      block_slices * (tile_pixels + filter_pixels) * sizeof(T);
   const int num_outputs =
       args.batch * args.out_rows * args.out_cols * args.out_depth;
-  if (args.in_rows & 1) {
+
+  if (data_format == FORMAT_NHWC) {
+    dim3 block_dim = dim3(block_slices, args.in_cols, block_rows);
     CudaLaunchConfig config = GetCudaLaunchConfig(
         num_outputs, d,
         DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth,
-                                          kKnownFilterHeight, false>,
+                                          kKnownFilterHeight, kKnownEvenRows>,
         shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
     DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, kKnownFilterHeight,
-                                      false>
+                                      kKnownEvenRows>
         <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
             args, input, filter, output);
-  } else {
+  } else if (data_format == FORMAT_NCHW) {
+    dim3 block_dim = dim3(args.in_cols, block_rows, block_slices);
     CudaLaunchConfig config = GetCudaLaunchConfig(
         num_outputs, d,
-        DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth,
-                                          kKnownFilterHeight, true>,
+        DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth,
+                                          kKnownFilterHeight, kKnownEvenRows>,
         shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
-    DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, kKnownFilterHeight,
-                                      true>
+    DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth, kKnownFilterHeight,
+                                      kKnownEvenRows>
         <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
             args, input, filter, output);
+  } else {
+    assert(false);
+  }
+}
+
+template <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
+void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
+                                   const T* input, const T* filter, T* output,
+                                   TensorFormat data_format) {
+  if (args.in_rows & 1) {
+    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight,
+                                  /*kKnownEvenRows=*/false>(
+        d, args, input, filter, output, data_format);
+  } else {
+    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight,
+                                  /*kKnownEvenRows=*/true>(
+        d, args, input, filter, output, data_format);
   }
-  return true;
 }
 
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
@@ -434,9 +590,9 @@ template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
 void LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
                               const T* input, const T* filter, T* output,
                               TensorFormat data_format) {
-  if (TryLaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth,
-                                       kKnownFilterHeight>(
-          d, args, input, filter, output, data_format)) {
+  if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
+    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight>(
+        d, args, input, filter, output, data_format);
     return;
   }
   const int num_outputs =
@@ -588,16 +744,15 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall(
 
   // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
   const int block_slices = 8;
-  const int block_cols = blockDim.y;
   const int block_rows = blockDim.z;
 
   // These values are the same for all threads and could
   // be precomputed on the CPU.
-  const int block_size = block_rows * block_cols * block_slices;
+  const int block_size = block_rows * in_cols * block_slices;
   const int in_row_size = in_cols * in_depth;
   const int in_size = in_rows * in_row_size;
   const int in_increment = (in_cols - 1) * block_slices;
-  const int filter_size = filter_rows * filter_cols;
+  const int filter_pixels = filter_rows * filter_cols;
   const int tile_cols = in_cols + filter_cols - 1;
   const int even_rows = kKnownEvenRows || (1 & ~in_rows);
   const int tile_rows = in_rows + filter_rows - even_rows;
@@ -615,7 +770,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall(
   const int thread_row = threadIdx.z;
 
   // Position in block.
-  const int thread_pix = thread_row * block_cols + thread_col;
+  const int thread_pix = thread_row * in_cols + thread_col;
   const int thread_idx = thread_pix * block_slices + thread_depth;
 
   // Initialize tile, in particular the padding.
@@ -637,9 +792,9 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall(
 
   const int max_depth = in_depth - thread_depth;
   const int filter_write_offset =
-      thread_pix < filter_size ? tile_size + thread_idx : 0;
+      thread_pix < filter_pixels ? tile_size + thread_idx : 0;
   const int filter_read_offset =
-      tile_size + filter_size * block_slices + thread_depth;
+      tile_size + filter_pixels * block_slices + thread_depth;
   const bool skip_second =
       !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
 
@@ -786,11 +941,11 @@ bool TryLaunchDepthwiseConv2dBackpropInputGPUSmall(
 
   const int tile_cols = args.in_cols + args.filter_cols - 1;
   const int tile_rows = block_rows * 2 + args.filter_rows - 1;
-  const int tile_size = tile_rows * tile_cols;
-  const int filter_size = args.filter_rows * args.filter_cols;
+  const int tile_pixels = tile_rows * tile_cols;
+  const int filter_pixels = args.filter_rows * args.filter_cols;
   dim3 block_dim = dim3(8, args.in_cols, block_rows);
   const int shared_memory_size =
-      block_dim.x * (tile_size + filter_size) * sizeof(T);
+      block_dim.x * (tile_pixels + filter_pixels) * sizeof(T);
 
   const int num_in_backprop =
       args.batch * args.in_rows * args.in_cols * args.in_depth;
@@ -1007,16 +1162,15 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
 
   // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
   const int block_slices = 8;
-  const int block_cols = blockDim.y;
   const int block_rows = blockDim.z;
 
   // These values are the same for all threads and could
   // be precomputed on the CPU.
-  const int block_size = block_rows * block_cols * block_slices;
+  const int block_size = block_rows * in_cols * block_slices;
   const int in_row_size = in_cols * in_depth;
   const int in_size = in_rows * in_row_size;
   const int in_increment = (in_cols - 1) * block_slices;
-  const int filter_size = filter_rows * filter_cols;
+  const int filter_pixels = filter_rows * filter_cols;
   const int tile_cols = in_cols + filter_cols - 1;
   const int tile_rows = 2 * block_rows + filter_rows - 1;
   const int tile_row_size = tile_cols * block_slices;
@@ -1028,14 +1182,14 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
   const int tensor_offset = block_rows * in_row_size;
   const int accum_pixels = 32;
   const int accum_increment = accum_pixels * block_slices;
-  const int accum_size = filter_size * accum_increment;
+  const int accum_size = filter_pixels * accum_increment;
 
   const int thread_depth = threadIdx.x;
   const int thread_col = threadIdx.y;
   const int thread_row = threadIdx.z;
 
   // Position in block.
-  const int thread_pix = thread_row * block_cols + thread_col;
+  const int thread_pix = thread_row * in_cols + thread_col;
   const int thread_idx = thread_pix * block_slices + thread_depth;
 
   // Initialize tile, in particular the padding and accumulator.
@@ -1247,11 +1401,11 @@ bool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
   const int block_rows = (args.in_rows + 1) / 2 + rows_mask & ~rows_mask;
   const int tile_cols = args.in_cols + args.filter_cols - 1;
   const int tile_rows = block_rows * 2 + args.filter_rows - 1;
-  const int tile_size = tile_rows * tile_cols;
+  const int tile_pixels = tile_rows * tile_cols;
   const int accum_size = args.filter_rows * args.filter_cols * 32;
   dim3 block_dim = dim3(8, args.in_cols, block_rows);
   const int shared_memory_size =
-      block_dim.x * (tile_size + accum_size) * sizeof(T);
+      block_dim.x * (tile_pixels + accum_size) * sizeof(T);
 
   if (block_rows > args.in_rows ||
       args.filter_rows * args.filter_cols > args.in_cols * block_rows ||
diff --git a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
index 3d6f493a9c..d5b4cf7451 100644
--- a/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
+++ b/tensorflow/core/kernels/hexagon/i_graph_transfer_ops_definitions.h
@@ -48,4 +48,4 @@ class IGraphTransferOpsDefinitions {
 
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_GRAPH_TRANSFER_OPS_DEFINITIONS_H
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_HEXAGON_I_GRAPH_TRANSFER_OPS_DEFINITIONS_H_
diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/iterator_ops.cc
index fa3f3a4db6..51f11f9d2e 100644
--- a/tensorflow/core/kernels/iterator_ops.cc
+++ b/tensorflow/core/kernels/iterator_ops.cc
@@ -246,7 +246,7 @@ class OneShotIteratorOp : public OpKernel {
                                      n.Notify();
                                    });
       n.WaitForNotification();
-      OP_REQUIRES_OK(ctx, std::move(factory_status));
+      OP_REQUIRES_OK(ctx, factory_status);
       OP_REQUIRES(
           ctx,
           return_values.size() == 1 &&
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
new file mode 100755
index 0000000000..23cabe7b54
--- /dev/null
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -0,0 +1,134 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "lmdb.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/framework/reader_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+#include <sys/stat.h>
+
+namespace tensorflow {
+
+inline void MDB_CHECK(int mdb_status) {
+  CHECK_EQ(mdb_status, MDB_SUCCESS) << mdb_strerror(mdb_status);
+}
+
+class LMDBReader : public ReaderBase {
+ public:
+  LMDBReader(const string& node_name, Env* env)
+      : ReaderBase(strings::StrCat("LMDBReader '", node_name, "'")),
+        env_(env),
+        mdb_env_(nullptr),
+        mdb_dbi_(0),
+        mdb_txn_(nullptr),
+        mdb_cursor_(nullptr) {}
+
+  Status OnWorkStartedLocked() override {
+    MDB_CHECK(mdb_env_create(&mdb_env_));
+    int flags = MDB_RDONLY | MDB_NOTLS;
+
+    // Check if the LMDB filename is actually a file instead of a directory.
+    // If so, set appropriate flags so we can open it.
+    struct stat source_stat;
+    if (stat(current_work().c_str(), &source_stat) == 0 &&
+        (source_stat.st_mode & S_IFREG)) {
+      flags |= MDB_NOSUBDIR;
+    }
+
+    MDB_CHECK(mdb_env_open(mdb_env_, current_work().c_str(), flags, 0664));
+    MDB_CHECK(mdb_txn_begin(mdb_env_, nullptr, MDB_RDONLY, &mdb_txn_));
+    MDB_CHECK(mdb_dbi_open(mdb_txn_, nullptr, 0, &mdb_dbi_));
+
+    return Status::OK();
+  }
+
+  Status OnWorkFinishedLocked() override {
+    if (mdb_env_ != nullptr) {
+      if (mdb_cursor_) {
+        mdb_cursor_close(mdb_cursor_);
+      }
+      mdb_txn_abort(mdb_txn_);
+      mdb_dbi_close(mdb_env_, mdb_dbi_);
+      mdb_env_close(mdb_env_);
+      mdb_env_ = nullptr;
+    }
+    return Status::OK();
+  }
+
+  Status ReadLocked(string* key, string* value, bool* produced,
+                    bool* at_end) override {
+    if (mdb_cursor_ == nullptr) {
+      MDB_CHECK(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_));
+      if (Seek(MDB_FIRST) == false) {
+        *at_end = true;
+        return Status::OK();
+      }
+    }
+    else {
+      if (Seek(MDB_NEXT) == false) {
+        *at_end = true;
+        return Status::OK();
+      }
+    }
+    *key = string(static_cast<const char*>(mdb_key_.mv_data),
+                  mdb_key_.mv_size);
+    *value = string(static_cast<const char*>(mdb_value_.mv_data),
+                    mdb_value_.mv_size);
+    *produced = true;
+    return Status::OK();
+  }
+
+  Status ResetLocked() override {
+    CHECK_EQ(Seek(MDB_FIRST), true);
+    return ReaderBase::ResetLocked();
+  }
+
+ private:
+  bool Seek(MDB_cursor_op op) {
+    CHECK_NOTNULL(mdb_cursor_);
+    int mdb_status = mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, op);
+    if (mdb_status == MDB_NOTFOUND) {
+      return false;
+    } else {
+      MDB_CHECK(mdb_status);
+      return true;
+    }
+  }
+
+  Env* const env_;
+  MDB_env* mdb_env_;
+  MDB_dbi mdb_dbi_;
+
+  MDB_txn* mdb_txn_;
+  MDB_cursor* mdb_cursor_;
+  MDB_val mdb_key_, mdb_value_;
+};
+
+class LMDBReaderOp : public ReaderOpKernel {
+ public:
+  explicit LMDBReaderOp(OpKernelConstruction* context)
+      : ReaderOpKernel(context) {
+    Env* env = context->env();
+    SetReaderFactory([this, env]() {
+      return new LMDBReader(name(), env);
+    });
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("LMDBReader").Device(DEVICE_CPU),
+                        LMDBReaderOp);
+
+}
diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc
index f119cb1902..894b0113c2 100644
--- a/tensorflow/core/kernels/matrix_band_part_op.cc
+++ b/tensorflow/core/kernels/matrix_band_part_op.cc
@@ -83,7 +83,7 @@ class MatrixBandPartOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                                 \
       Name("MatrixBandPart").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
       MatrixBandPartOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_BAND_PART);
+TF_CALL_POD_TYPES(REGISTER_MATRIX_BAND_PART);
 #undef REGISTER_MATRIX_BAND_PART
 
 // Registration of the deprecated kernel.
@@ -143,6 +143,7 @@ namespace functor {
   extern template struct MatrixBandPart<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+TF_CALL_bool(DECLARE_GPU_SPEC);
 TF_CALL_complex64(DECLARE_GPU_SPEC);
 TF_CALL_complex128(DECLARE_GPU_SPEC);
 }  // namespace functor
@@ -156,6 +157,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC);
                               .HostMemory("num_upper"),  \
                           MatrixBandPartOp<GPUDevice, type>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_BAND_PART_GPU);
+TF_CALL_bool(REGISTER_MATRIX_BAND_PART_GPU);
 TF_CALL_complex64(REGISTER_MATRIX_BAND_PART_GPU);
 TF_CALL_complex128(REGISTER_MATRIX_BAND_PART_GPU);
 #undef REGISTER_MATRIX_BAND_PART_GPU
diff --git a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc
index fab3fb4a1f..ccc10ebada 100644
--- a/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/matrix_band_part_op_gpu.cu.cc
@@ -29,6 +29,7 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct functor::MatrixBandPart<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
+TF_CALL_bool(DEFINE_GPU_SPEC);
 TF_CALL_complex64(DEFINE_GPU_SPEC);
 TF_CALL_complex128(DEFINE_GPU_SPEC);
 
diff --git a/tensorflow/core/kernels/matrix_diag_op.cc b/tensorflow/core/kernels/matrix_diag_op.cc
index bc193357ad..75c49baaa8 100644
--- a/tensorflow/core/kernels/matrix_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_diag_op.cc
@@ -123,7 +123,7 @@ class MatrixDiagOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                                 \
       Name("MatrixDiagPart").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
       MatrixDiagPartOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_DIAG);
+TF_CALL_POD_TYPES(REGISTER_MATRIX_DIAG);
 #undef REGISTER_MATRIX_DIAG
 
 // Registration of the deprecated kernel.
@@ -136,7 +136,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_DIAG);
                               .Device(DEVICE_CPU)                           \
                               .TypeConstraint<type>("T"),                   \
                           MatrixDiagPartOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_BATCH_MATRIX_DIAG);
+TF_CALL_POD_TYPES(REGISTER_BATCH_MATRIX_DIAG);
 #undef REGISTER_BATCH_MATRIX_DIAG
 
 // Implementation of the functor specialization for CPU.
@@ -187,6 +187,7 @@ namespace functor {
   extern template struct MatrixDiagPart<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+TF_CALL_bool(DECLARE_GPU_SPEC);
 TF_CALL_complex64(DECLARE_GPU_SPEC);
 TF_CALL_complex128(DECLARE_GPU_SPEC);
 
@@ -201,6 +202,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC);
       Name("MatrixDiagPart").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
       MatrixDiagPartOp<GPUDevice, type>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_DIAG_GPU);
+TF_CALL_bool(REGISTER_MATRIX_DIAG_GPU);
 TF_CALL_complex64(REGISTER_MATRIX_DIAG_GPU);
 TF_CALL_complex128(REGISTER_MATRIX_DIAG_GPU);
 #undef REGISTER_MATRIX_DIAG_GPU
diff --git a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc
index 14774d5404..cfb1fa10fc 100644
--- a/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/matrix_diag_op_gpu.cu.cc
@@ -31,6 +31,7 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct functor::MatrixDiagPart<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
+TF_CALL_bool(DEFINE_GPU_SPEC);
 TF_CALL_complex64(DEFINE_GPU_SPEC);
 TF_CALL_complex128(DEFINE_GPU_SPEC);
 
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.cc b/tensorflow/core/kernels/matrix_set_diag_op.cc
index cbb2b68b7f..3397af56bc 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op.cc
@@ -100,7 +100,7 @@ class MatrixSetDiagOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                                \
       Name("MatrixSetDiag").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
       MatrixSetDiagOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG);
+TF_CALL_POD_TYPES(REGISTER_MATRIX_SET_DIAG);
 #undef REGISTER_MATRIX_SET_DIAG
 
 // Registration of the deprecated kernel.
@@ -109,7 +109,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG);
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("BatchMatrixSetDiag").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
       MatrixSetDiagOp<CPUDevice, type>);
-TF_CALL_NUMBER_TYPES(REGISTER_BATCH_MATRIX_SET_DIAG);
+TF_CALL_POD_TYPES(REGISTER_BATCH_MATRIX_SET_DIAG);
 #undef REGISTER_BATCH_MATRIX_SET_DIAG
 
 namespace functor {
@@ -131,6 +131,21 @@ struct MatrixSetDiag<CPUDevice, T> {
   }
 };
 
+template <>
+struct MatrixSetDiag<CPUDevice, bool> {
+  static void Compute(const CPUDevice& d, TTypes<bool, 3>::ConstTensor input,
+                      TTypes<bool, 2>::ConstTensor diag,
+                      TTypes<bool>::Scalar scratch,
+                      TTypes<bool, 3>::Tensor output) {
+    output.device(d) = input;
+    for (int64 r = 0; r < output.dimension(0); ++r) {
+      for (int64 d = 0; d < diag.dimension(1); ++d) {
+        output(r, d, d) = diag(r, d);
+      }
+    }
+  }
+};
+
 }  // namespace functor
 
 #if GOOGLE_CUDA
@@ -147,6 +162,7 @@ namespace functor {
   extern template struct MatrixSetDiag<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+TF_CALL_bool(DECLARE_GPU_SPEC);
 TF_CALL_complex64(DECLARE_GPU_SPEC);
 TF_CALL_complex128(DECLARE_GPU_SPEC);
 
@@ -158,6 +174,7 @@ TF_CALL_complex128(DECLARE_GPU_SPEC);
       Name("MatrixSetDiag").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
       MatrixSetDiagOp<GPUDevice, type>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_MATRIX_SET_DIAG_GPU);
+TF_CALL_bool(REGISTER_MATRIX_SET_DIAG_GPU);
 TF_CALL_complex64(REGISTER_MATRIX_SET_DIAG_GPU);
 TF_CALL_complex128(REGISTER_MATRIX_SET_DIAG_GPU);
 #undef REGISTER_MATRIX_SET_DIAG_GPU
diff --git a/tensorflow/core/kernels/matrix_set_diag_op.h b/tensorflow/core/kernels/matrix_set_diag_op.h
index 8ba2f3756a..63e5650bf0 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op.h
+++ b/tensorflow/core/kernels/matrix_set_diag_op.h
@@ -71,6 +71,23 @@ struct MatrixSetDiag {
   }
 };
 
+template <typename Device>
+struct MatrixSetDiag<Device, bool> {
+  EIGEN_ALWAYS_INLINE static void Compute(const Device& d,
+                                          TTypes<bool, 3>::ConstTensor input,
+                                          TTypes<bool, 2>::ConstTensor diag,
+                                          TTypes<bool>::Scalar scratch,
+                                          TTypes<bool, 3>::Tensor output) {
+    output.device(d) = input;
+    generator::OverwriteDiagGenerator<bool> generator(diag, output);
+    // Use all() to force the generation to aggregate to the scalar
+    // output scratch.  This in turn forces each element of the
+    // generator to execute.  The side effect of the execution is to
+    // update the diagonal components of output with diag.
+    scratch.device(d) = diag.generate(generator).all();
+  }
+};
+
 }  // namespace functor
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc
index bd097ff328..8e41ce5860 100644
--- a/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/matrix_set_diag_op_gpu.cu.cc
@@ -29,6 +29,7 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct functor::MatrixSetDiag<GPUDevice, T>;
 
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
+TF_CALL_bool(DEFINE_GPU_SPEC);
 TF_CALL_complex64(DEFINE_GPU_SPEC);
 TF_CALL_complex128(DEFINE_GPU_SPEC);
 
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 1c7d50e161..4c656bd74b 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -52,17 +52,16 @@ SendOp::SendOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
   OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
   key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device,
                                        send_device_incarnation, tensor_name);
+  // The vast majority of Send nodes are outside any loop context, so
+  // proactively cache the rendezvous key for the top-level.
+  GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_);
+  OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_));
 }
 
 void SendOp::Compute(OpKernelContext* ctx) {
   OP_REQUIRES(
       ctx, ctx->rendezvous() != nullptr,
       errors::Internal("Op kernel context needs to provide a rendezvous."));
-  Rendezvous::ParsedKey parsed;
-  GetRendezvousKey(key_prefix_, ctx->frame_iter(), &parsed.buf_);
-  VLOG(2) << "Send " << parsed.buf_;
-
-  OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed.buf_, &parsed));
 
   // The device context may be passed between the Send/Recv
   // boundary, so that the device context used to produce the Tensor
@@ -71,8 +70,24 @@ void SendOp::Compute(OpKernelContext* ctx) {
   Rendezvous::Args args;
   args.device_context = ctx->op_device_context();
   args.alloc_attrs = ctx->input_alloc_attr(0);
-  OP_REQUIRES_OK(ctx, ctx->rendezvous()->Send(parsed, args, ctx->input(0),
-                                              ctx->is_input_dead()));
+
+  if (ctx->frame_iter() == FrameAndIter(0, 0)) {
+    // Use the cached rendezvous key.
+    VLOG(2) << "Send " << parsed_key_.buf_;
+    OP_REQUIRES_OK(ctx,
+                   ctx->rendezvous()->Send(parsed_key_, args, ctx->input(0),
+                                           ctx->is_input_dead()));
+  } else {
+    Rendezvous::ParsedKey in_loop_parsed;
+    GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_);
+    VLOG(2) << "Send " << in_loop_parsed.buf_;
+    OP_REQUIRES_OK(ctx,
+                   Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed));
+
+    OP_REQUIRES_OK(ctx,
+                   ctx->rendezvous()->Send(in_loop_parsed, args, ctx->input(0),
+                                           ctx->is_input_dead()));
+  }
 }
 
 REGISTER_KERNEL_BUILDER(Name("_Send").Device(DEVICE_CPU), SendOp);
@@ -101,17 +116,16 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
   OP_REQUIRES_OK(ctx, ctx->GetAttr("tensor_name", &tensor_name));
   key_prefix_ = GetRendezvousKeyPrefix(send_device, recv_device,
                                        send_device_incarnation, tensor_name);
+  // The vast majority of Recv nodes are outside any loop context, so
+  // proactively cache the rendezvous key for the top-level.
+  GetRendezvousKey(key_prefix_, {0, 0}, &parsed_key_.buf_);
+  OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(parsed_key_.buf_, &parsed_key_));
 }
 
 void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
   OP_REQUIRES(
       ctx, ctx->rendezvous() != nullptr,
       errors::Internal("Op kernel context needs to provide a rendezvous."));
-  Rendezvous::ParsedKey parsed;
-  GetRendezvousKey(key_prefix_, ctx->frame_iter(), &parsed.buf_);
-  VLOG(2) << "Recv " << parsed.buf_;
-
-  OP_REQUIRES_OK_ASYNC(ctx, Rendezvous::ParseKey(parsed.buf_, &parsed), done);
 
   Rendezvous::Args args;
   args.device_context = ctx->op_device_context();
@@ -136,7 +150,19 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
         done();
       },
       std::move(done), _1, _2, _3, _4, _5);
-  ctx->rendezvous()->RecvAsync(parsed, args, std::move(done_cb));
+
+  if (ctx->frame_iter() == FrameAndIter(0, 0)) {
+    VLOG(2) << "Recv " << parsed_key_.buf_;
+    ctx->rendezvous()->RecvAsync(parsed_key_, args, std::move(done_cb));
+  } else {
+    Rendezvous::ParsedKey in_loop_parsed;
+    GetRendezvousKey(key_prefix_, ctx->frame_iter(), &in_loop_parsed.buf_);
+    VLOG(2) << "Recv " << in_loop_parsed.buf_;
+    OP_REQUIRES_OK_ASYNC(
+        ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed), done);
+
+    ctx->rendezvous()->RecvAsync(in_loop_parsed, args, std::move(done_cb));
+  }
 }
 
 REGISTER_KERNEL_BUILDER(Name("_Recv").Device(DEVICE_CPU), RecvOp);
diff --git a/tensorflow/core/kernels/sendrecv_ops.h b/tensorflow/core/kernels/sendrecv_ops.h
index 6e91422682..67867e3308 100644
--- a/tensorflow/core/kernels/sendrecv_ops.h
+++ b/tensorflow/core/kernels/sendrecv_ops.h
@@ -28,6 +28,7 @@ class SendOp : public OpKernel {
 
  private:
   string key_prefix_;
+  Rendezvous::ParsedKey parsed_key_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(SendOp);
 };
@@ -39,6 +40,7 @@ class RecvOp : public AsyncOpKernel {
 
  private:
   string key_prefix_;
+  Rendezvous::ParsedKey parsed_key_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(RecvOp);
 };
diff --git a/tensorflow/core/kernels/sendrecv_ops_test.cc b/tensorflow/core/kernels/sendrecv_ops_test.cc
new file mode 100644
index 0000000000..092a29f2f3
--- /dev/null
+++ b/tensorflow/core/kernels/sendrecv_ops_test.cc
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Implement a trivial version of the Rendezvous interface, to avoid
+// clouding the benchmark results with the time spent in the various
+// implementations, and to avoid the duplicate-send or duplicate-recv
+// errors that would arise from running either benchmark in a loop.
+class DummyRendezvous : public Rendezvous {
+  Status Send(const ParsedKey& key, const Args& args, const Tensor& val,
+              const bool is_dead) override {
+    return Status::OK();
+  }
+  void RecvAsync(const ParsedKey& key, const Args& args,
+                 DoneCallback done) override {
+    static Tensor* t = new Tensor(DT_FLOAT, TensorShape({0}));
+    done(Status::OK(), args, args, *t, false);
+  }
+  void StartAbort(const Status& status) override {}
+};
+
+static Graph* Send() {
+  Graph* g = new Graph(OpRegistry::Global());
+  Tensor in0(DT_FLOAT, TensorShape({0}));
+  test::graph::Send(g, test::graph::Constant(g, in0), "T", "/cpu:0", 1,
+                    "/cpu:0");
+  test::graph::Recv(g, "T", "float", "/cpu:0", 1, "/cpu:0");
+  return g;
+}
+
+static Graph* Recv() {
+  Graph* g = new Graph(OpRegistry::Global());
+  test::graph::Recv(g, "T", "float", "/cpu:0", 1, "/cpu:0");
+  return g;
+}
+
+static void BM_Send(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", Send(), nullptr, nullptr, new DummyRendezvous)
+      .Run(iters);
+}
+BENCHMARK(BM_Send);
+
+static void BM_Recv(int iters) {
+  testing::UseRealTime();
+  testing::ItemsProcessed(static_cast<int64>(iters));
+  test::Benchmark("cpu", Recv(), nullptr, nullptr, new DummyRendezvous)
+      .Run(iters);
+}
+BENCHMARK(BM_Recv);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/inputbuffer_test.cc b/tensorflow/core/lib/io/inputbuffer_test.cc
index 4571df9a98..6771697a16 100644
--- a/tensorflow/core/lib/io/inputbuffer_test.cc
+++ b/tensorflow/core/lib/io/inputbuffer_test.cc
@@ -298,9 +298,9 @@ TEST(InputBuffer, ReadVarint32) {
   // Generates data.
   std::vector<uint32> data;
   uint32 i = 0;
-  for (; i < (1 << 10); i += 1) data.push_back(i);
-  for (; i < (1 << 15); i += 5) data.push_back(i);
-  for (; i < (1 << 31); i += 132817) data.push_back(i);
+  for (; i < (1U << 10); i += 1) data.push_back(i);
+  for (; i < (1U << 15); i += 5) data.push_back(i);
+  for (; i < (1U << 31); i += 132817) data.push_back(i);
   data.push_back(std::numeric_limits<uint32>::max());
 
   // Writes the varints.
diff --git a/tensorflow/core/lib/io/zlib_inputstream.cc b/tensorflow/core/lib/io/zlib_inputstream.cc
index 3de157a1fc..4999d5cc90 100644
--- a/tensorflow/core/lib/io/zlib_inputstream.cc
+++ b/tensorflow/core/lib/io/zlib_inputstream.cc
@@ -58,16 +58,14 @@ void ZlibInputStream::InitZlibBuffer() {
   z_stream_->avail_in = 0;
 
   int status = inflateInit2(z_stream_.get(), zlib_options_.window_bits);
-  if (status != Z_OK) {
-    LOG(FATAL) << "inflateInit failed with status " << status;
-    z_stream_.reset(nullptr);
-  } else {
-    z_stream_->next_in = z_stream_input_.get();
-    z_stream_->next_out = z_stream_output_.get();
-    next_unread_byte_ = reinterpret_cast<char*>(z_stream_output_.get());
-    z_stream_->avail_in = 0;
-    z_stream_->avail_out = output_buffer_capacity_;
-  }
+
+  CHECK_EQ(status, Z_OK) << "inflateInit failed with status " << status;
+
+  z_stream_->next_in = z_stream_input_.get();
+  z_stream_->next_out = z_stream_output_.get();
+  next_unread_byte_ = reinterpret_cast<char*>(z_stream_output_.get());
+  z_stream_->avail_in = 0;
+  z_stream_->avail_out = output_buffer_capacity_;
 }
 
 Status ZlibInputStream::ReadFromStream() {
diff --git a/tensorflow/core/lib/lmdb/testdata/data.mdb b/tensorflow/core/lib/lmdb/testdata/data.mdb
new file mode 100644
index 0000000000..3ea75699cb
--- /dev/null
+++ b/tensorflow/core/lib/lmdb/testdata/data.mdb
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index a7b4422bab..39a3b42a87 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -175,7 +175,11 @@ TEST(ArrayOpsTest, Identity_ShapeFnHandles) {
   TF_ASSERT_OK(c.construction_status());
   ASSERT_TRUE(op_reg_data->shape_inference_fn != nullptr);
   TF_ASSERT_OK(c.Run(op_reg_data->shape_inference_fn));
-  EXPECT_TRUE(c.output_handle_dtype(0) == DT_BOOL);
+
+  const auto* shapes_and_types = c.output_handle_shapes_and_types(0);
+  ASSERT_TRUE(shapes_and_types != nullptr);
+  ASSERT_EQ(1, shapes_and_types->size());
+  EXPECT_EQ((*shapes_and_types)[0].dtype, DT_BOOL);
 }
 
 TEST(ArrayOpsTest, Diag_ShapeFn) {
diff --git a/tensorflow/core/ops/io_ops.cc b/tensorflow/core/ops/io_ops.cc
index 0bce6fc0ea..b7afab84a1 100644
--- a/tensorflow/core/ops/io_ops.cc
+++ b/tensorflow/core/ops/io_ops.cc
@@ -520,6 +520,21 @@ shared_name: If non-empty, this reader is named in the given bucket
              with this shared_name. Otherwise, the node name is used instead.
 )doc");
 
+REGISTER_OP("LMDBReader")
+    .Output("reader_handle: Ref(string)")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetIsStateful()
+    .SetShapeFn(TwoElementOutput)
+    .Doc(R"doc(
+A Reader that outputs the records from a LMDB file.
+reader_handle: The handle to reference the Reader.
+container: If non-empty, this reader is placed in the given container.
+        Otherwise, a default container is used.
+shared_name: If non-empty, this reader is named in the given bucket
+             with this shared_name. Otherwise, the node name is used instead.
+)doc");
+
 // TODO(cwhipkey): mark this deprecated in favor of V2.
 REGISTER_OP("IdentityReader")
     .Output("reader_handle: Ref(string)")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index d82e98beb9..997b7de3d9 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -26050,6 +26050,33 @@ op {
   is_stateful: true
 }
 op {
+  name: "LMDBReader"
+  output_arg {
+    name: "reader_handle"
+    description: "The handle to reference the Reader."
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this reader is placed in the given container.\nOtherwise, a default container is used."
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+    description: "If non-empty, this reader is named in the given bucket\nwith this shared_name. Otherwise, the node name is used instead."
+  }
+  summary: "A Reader that outputs the records from a LMDB database."
+  is_stateful: true
+}
+op {
   name: "TakeDataset"
   input_arg {
     name: "input_dataset"
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 615ca85ab7..39446b6ca2 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -546,6 +546,22 @@ cuda_py_test(
     tags = ["notsan"],
 )
 
+cuda_py_test(
+    name = "session_debug_multi_gpu_test",
+    size = "small",
+    srcs = ["lib/session_debug_multi_gpu_test.py"],
+    additional_deps = [
+        ":debug_data",
+        ":debug_utils",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
+
 py_test(
     name = "debugger_cli_common_test",
     size = "small",
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 35e9fef29f..748e5d78ba 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -19,10 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import glob
 import json
 import os
 
 import numpy as np
+import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import graph_pb2
@@ -32,9 +34,12 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import gfile
 
 
+# TODO(cais): Tie these string constants in with C++?
 METADATA_FILE_PREFIX = "_tfdbg_"
 CORE_METADATA_TAG = "core_metadata_"
 GRAPH_FILE_TAG = "graph_"
+DEVICE_TAG = "device_"
+
 FETCHES_INFO_FILE_TAG = "fetches_info_"
 FEED_KEYS_INFO_FILE_TAG = "feed_keys_info_"
 
@@ -158,10 +163,6 @@ def parse_node_or_tensor_name(name):
     return name, None
 
 
-def _is_core_metadata_file(file_name):
-  return file_name.startswith(METADATA_FILE_PREFIX + CORE_METADATA_TAG)
-
-
 def _is_graph_file(file_name):
   return file_name.startswith(METADATA_FILE_PREFIX + GRAPH_FILE_TAG)
 
@@ -344,6 +345,28 @@ def extract_core_metadata_from_event_proto(event):
                        json_metadata["target_nodes"])
 
 
+def device_name_to_device_path(device_name):
+  """Convert device name to device path."""
+  device_name_items = device_name.split("/")
+  device_name_items = [item.replace(":", "_") for item in device_name_items]
+  return METADATA_FILE_PREFIX + DEVICE_TAG + ",".join(device_name_items)
+
+
+def device_path_to_device_name(device_dir):
+  """Parse device name from device path.
+
+  Args:
+    device_dir: (str) a directory name for the device.
+
+  Returns:
+    (str) parsed device name.
+  """
+  path_items = os.path.basename(device_dir)[
+      len(METADATA_FILE_PREFIX) + len(DEVICE_TAG):].split(",")
+  return "/".join([
+      path_item.replace("_", ":", 1) for path_item in path_items])
+
+
 class DebugTensorDatum(object):
   """A single tensor dumped by TensorFlow Debugger (tfdbg).
 
@@ -360,13 +383,17 @@ class DebugTensorDatum(object):
     """`DebugTensorDatum` constructor.
 
     Args:
-      dump_root: (`str`) Debug dump root directory.
+      dump_root: (`str`) Debug dump root directory. This path should not include
+        the path component that represents the device name (see also below).
       debug_dump_rel_path: (`str`) Path to a debug dump file, relative to the
-          `dump_root`. For example, suppose the debug dump root
-          directory is `/tmp/tfdbg_1` and the dump file is at
-          `/tmp/tfdbg_1/ns_1/node_a_0_DebugIdentity_123456789`, then
-          the value of the debug_dump_rel_path should be
-          `ns_1/node_a_0_DebugIdenity_1234456789`.
+        `dump_root`. The first item of this relative path is assumed to be
+        a path representing the name of the device that the Tensor belongs to.
+        See `device_path_to_device_name` for more details on the device path.
+        For example, suppose the debug dump root
+        directory is `/tmp/tfdbg_1` and the dump file is at
+        `/tmp/tfdbg_1/<device_path>/>ns_1/node_a_0_DebugIdentity_123456789`,
+        then the value of the debug_dump_rel_path should be
+        `<device_path>/ns_1/node_a_0_DebugIdenity_1234456789`.
 
     Raises:
       ValueError: If the base file name of the dump file does not conform to
@@ -374,15 +401,13 @@ class DebugTensorDatum(object):
         `node_name`_`output_slot`_`debug_op`_`timestamp`
     """
 
-    base = os.path.basename(debug_dump_rel_path)
-
+    path_components = os.path.normpath(debug_dump_rel_path).split(os.sep)
+    self._device_name = device_path_to_device_name(path_components[0])
+    base = path_components[-1]
     if base.count("_") < 3:
       raise ValueError(
           "Dump file path does not conform to the naming pattern: %s" % base)
 
-    # TODO(cais): Add hostname and pid to support dumps from distributed
-    #             sessions.
-
     self._extended_timestamp = base.split("_")[-1]
     # It may include an index suffix at the end if file path collision happened
     # due to identical timestamps.
@@ -395,31 +420,23 @@ class DebugTensorDatum(object):
     self._debug_op = base.split("_")[-2]
     self._output_slot = int(base.split("_")[-3])
 
-    namespace = os.path.dirname(debug_dump_rel_path).replace("\\", "/")
     node_base_name = "_".join(base.split("_")[:-3])
-    if not namespace or namespace == ".":
-      self._node_name = node_base_name
-    else:
-      self._node_name = namespace + "/" + node_base_name
+    self._node_name = "/".join(path_components[1:-1] + [node_base_name])
 
     self._file_path = os.path.join(dump_root, debug_dump_rel_path)
     self._dump_size_bytes = (gfile.Stat(self._file_path).length if
                              gfile.Exists(self._file_path) else None)
 
-    self._run_fetches_info = None
-    self._run_feed_keys_info = None
-
   def __str__(self):
-    return "{DebugTensorDatum: %s:%d @ %s @ %d}" % (self.node_name,
-                                                    self.output_slot,
-                                                    self.debug_op,
-                                                    self.timestamp)
+    return "{DebugTensorDatum (%s) %s:%d @ %s @ %d}" % (self.device_name,
+                                                        self.node_name,
+                                                        self.output_slot,
+                                                        self.debug_op,
+                                                        self.timestamp)
 
   def __repr__(self):
     return self.__str__()
 
-  # TODO(cais): (b/38325442) Add device name information to this class.
-
   def get_tensor(self):
     """Get tensor from the dump (`Event`) file.
 
@@ -465,6 +482,16 @@ class DebugTensorDatum(object):
     return self._debug_op
 
   @property
+  def device_name(self):
+    """Name of the device that the tensor belongs to.
+
+    Returns:
+      (`str`) device name.
+    """
+
+    return self._device_name
+
+  @property
   def node_name(self):
     """Name of the node from which the tensor value was dumped.
 
@@ -529,6 +556,8 @@ class WatchKeyDoesNotExistInDebugDumpDirError(ValueError):
   pass
 
 
+# TODO(cais): This class is getting too large in line count. Refactor to make it
+# smaller and easier to maintain.
 class DebugDumpDir(object):
   """Data set from a debug-dump directory on filesystem.
 
@@ -548,23 +577,54 @@ class DebugDumpDir(object):
 
     Raises:
       IOError: If dump_root does not exist as a directory.
+      ValueError: If more than one core metadata file is found under the dump
+        root directory.
     """
 
     if not gfile.IsDirectory(dump_root):
       raise IOError("Dump root directory %s does not exist" % dump_root)
 
-    self._core_metadata = None
-    self._load_dumps(dump_root)
-    self._create_tensor_watch_maps()
-    self._load_partition_graphs(partition_graphs, validate)
+    self._core_metadata = []
+
+    # Find the list of devices.
+    self._dump_root = dump_root
+
+    self._load_core_metadata()
+    self._load_fetches_info()
+    self._load_feeds_info()
+    self._load_all_device_dumps(partition_graphs, validate)
 
     self._python_graph = None
 
-  def _load_dumps(self, dump_root):
-    """Load `DebugTensorDatum` instances from the dump root.
+  def _load_all_device_dumps(self, partition_graphs, validate):
+    """Load the dump data for all devices."""
+    device_dirs = glob.glob(os.path.join(
+        self._dump_root, METADATA_FILE_PREFIX + DEVICE_TAG + "*"))
+
+    self._device_names = []
+    self._t0s = {}
+    self._dump_tensor_data = {}
+    self._dump_graph_file_paths = {}
+    self._debug_watches = {}
+    self._watch_key_to_devices = {}
+    self._watch_key_to_datum = {}
+    self._watch_key_to_rel_time = {}
+    self._watch_key_to_dump_size_bytes = {}
+    for device_dir in device_dirs:
+      device_name = device_path_to_device_name(device_dir)
+      self._device_names.append(device_name)
+      self._load_device_dumps(device_name, device_dir)
+    self._load_partition_graphs(partition_graphs, validate)
+    self._calculate_t0()
+
+    for device_name in self._device_names:
+      self._create_tensor_watch_maps(device_name)
 
-    Populates a list of `DebugTensorDatum` instance and sorts the list by
-    ascending timestamp.
+  def _load_device_dumps(self, device_name, device_root):
+    """Load `DebugTensorDatum` instances from the dump root of a given device.
+
+    Populates a map {device_name: a list of `DebugTensorDatum`}, where the list
+    is sorted by  ascending timestamp.
 
     This sorting order reflects the order in which the TensorFlow executor
     processed the nodes of the graph. It is (one of many possible) topological
@@ -584,55 +644,67 @@ class DebugDumpDir(object):
     graphs may not be available, e.g., when the run errors out.
 
     Args:
-      dump_root: (`str`) Dump root directory.
-    """
+      device_name: (`str`) name of the device.
+      device_root: (`str`) dump root directory of the given device.
 
-    self._dump_root = dump_root
-    self._dump_tensor_data = []
-    self._dump_graph_file_paths = []
+    Raises:
+      ValueError: If GraphDef for the device is not available.
+    """
 
-    self._debug_watches = collections.defaultdict(
+    self._dump_tensor_data[device_name] = []
+    self._debug_watches[device_name] = collections.defaultdict(
         lambda: collections.defaultdict(set))
 
-    for root, _, files in gfile.Walk(self._dump_root):
+    for root, _, files in gfile.Walk(device_root):
       for f in files:
-        if f.startswith(METADATA_FILE_PREFIX):
-          if _is_core_metadata_file(f):
-            self._load_core_metadata(os.path.join(self._dump_root, root, f))
-
-          if _is_graph_file(f):
-            self._dump_graph_file_paths.append(
-                os.path.join(self._dump_root, root, f))
-
-          if _is_run_fetches_info_file(f):
-            self._run_fetches_info = _load_log_message_from_event_file(
-                os.path.join(root, f))
-
-          if _is_run_feed_keys_info_file(f):
-            self._run_feed_keys_info = _load_log_message_from_event_file(
-                os.path.join(root, f))
-
-          continue
-
-        datum = self._dump_file_name_to_datum(root, f)
-        self._dump_tensor_data.append(datum)
-
-        self._debug_watches[datum.node_name][datum.output_slot].add(
-            datum.debug_op)
-
-    self._dump_tensor_data = sorted(
-        self._dump_tensor_data, key=lambda x: x.extended_timestamp)
-
-    if self._dump_tensor_data:
-      self._t0 = self._dump_tensor_data[0].timestamp
+        if _is_graph_file(f):
+          self._dump_graph_file_paths[device_name] = os.path.join(
+              device_root, root, f)
+        else:
+          datum = self._dump_file_name_to_datum(root, f)
+          self._dump_tensor_data[device_name].append(datum)
+          self._debug_watches[device_name][datum.node_name][
+              datum.output_slot].add(datum.debug_op)
+
+    self._dump_tensor_data[device_name] = sorted(
+        self._dump_tensor_data[device_name],
+        key=lambda x: x.extended_timestamp)
+
+    if self._dump_tensor_data[device_name]:
+      self._t0s[device_name] = self._dump_tensor_data[device_name][0].timestamp
     else:
-      self._t0 = None
-
-  def _load_core_metadata(self, event_file_path):
-    event = event_pb2.Event()
-    with gfile.Open(event_file_path, "rb") as f:
-      event.ParseFromString(f.read())
-      self._core_metadata = extract_core_metadata_from_event_proto(event)
+      self._t0s[device_name] = None
+
+  def _calculate_t0(self):
+    """Calculate the first timestamp across all devices."""
+    t0s = [t0 for t0 in six.itervalues(self._t0s) if t0 is not None]
+    self._t0 = min(t0s) if t0s else None
+
+  def _load_core_metadata(self):
+    core_metadata_files = glob.glob(os.path.join(
+        self._dump_root, METADATA_FILE_PREFIX + CORE_METADATA_TAG + "*"))
+    for core_metadata_file in core_metadata_files:
+      with gfile.Open(core_metadata_file, "rb") as f:
+        event = event_pb2.Event()
+        event.ParseFromString(f.read())
+        self._core_metadata.append(
+            extract_core_metadata_from_event_proto(event))
+
+  def _load_fetches_info(self):
+    fetches_info_files = glob.glob(os.path.join(
+        self._dump_root, METADATA_FILE_PREFIX + FETCHES_INFO_FILE_TAG + "*"))
+    self._run_fetches_info = []
+    for fetches_info_file in fetches_info_files:
+      self._run_fetches_info.append(
+          _load_log_message_from_event_file(fetches_info_file))
+
+  def _load_feeds_info(self):
+    feeds_info_files = glob.glob(os.path.join(
+        self._dump_root, METADATA_FILE_PREFIX + FEED_KEYS_INFO_FILE_TAG + "*"))
+    self._run_feed_keys_info = []
+    for feeds_info_file in feeds_info_files:
+      self._run_feed_keys_info.append(
+          _load_log_message_from_event_file(feeds_info_file))
 
   def _dump_file_name_to_datum(self, dir_name, file_name):
     """Obtain a DebugTensorDatum from the directory and file name.
@@ -648,34 +720,39 @@ class DebugDumpDir(object):
     # Calculate the relative path of the dump file with respect to the root.
     debug_dump_rel_path = os.path.join(
         os.path.relpath(dir_name, self._dump_root), file_name)
-
     return DebugTensorDatum(self._dump_root, debug_dump_rel_path)
 
-  def _create_tensor_watch_maps(self):
+  def _create_tensor_watch_maps(self, device_name):
     """Create maps from tensor watch keys to datum and to timestamps.
 
     Create a map from watch key (tensor name + debug op) to `DebugTensorDatum`
     item. Also make a map from watch key to relative timestamp.
     "relative" means (absolute timestamp - t0).
+
+    Args:
+      device_name: (str) name of the device.
     """
 
-    self._watch_key_to_datum = {}
-    self._watch_key_to_rel_time = {}
-    self._watch_key_to_dump_size_bytes = {}
-    for datum in self._dump_tensor_data:
-      if datum.watch_key not in self._watch_key_to_datum:
-        self._watch_key_to_datum[datum.watch_key] = [datum]
-        self._watch_key_to_rel_time[datum.watch_key] = [
-            datum.timestamp - self._t0
-        ]
-        self._watch_key_to_dump_size_bytes[datum.watch_key] = [
-            datum.dump_size_bytes
-        ]
+    self._watch_key_to_datum[device_name] = {}
+    self._watch_key_to_rel_time[device_name] = {}
+    self._watch_key_to_dump_size_bytes[device_name] = {}
+    for datum in self._dump_tensor_data[device_name]:
+      if datum.watch_key not in self._watch_key_to_devices:
+        self._watch_key_to_devices[datum.watch_key] = {device_name}
+      else:
+        self._watch_key_to_devices[datum.watch_key].add(device_name)
+
+      if datum.watch_key not in self._watch_key_to_datum[device_name]:
+        self._watch_key_to_datum[device_name][datum.watch_key] = [datum]
+        self._watch_key_to_rel_time[device_name][datum.watch_key] = [
+            datum.timestamp - self._t0]
+        self._watch_key_to_dump_size_bytes[device_name][datum.watch_key] = [
+            datum.dump_size_bytes]
       else:
-        self._watch_key_to_datum[datum.watch_key].append(datum)
-        self._watch_key_to_rel_time[datum.watch_key].append(datum.timestamp -
-                                                            self._t0)
-        self._watch_key_to_dump_size_bytes[datum.watch_key].append(
+        self._watch_key_to_datum[device_name][datum.watch_key].append(datum)
+        self._watch_key_to_rel_time[device_name][datum.watch_key].append(
+            datum.timestamp - self._t0)
+        self._watch_key_to_dump_size_bytes[device_name][datum.watch_key].append(
             datum.dump_size_bytes)
 
   def set_python_graph(self, python_graph):
@@ -733,22 +810,32 @@ class DebugDumpDir(object):
         `output_names`: Names of the output (fetched) Tensors.
         `target_nodes`: Names of the target nodes.
       If the core metadata have not been loaded, `None`.
+      If more than one core metadata files exist, return a list of the
+        `nametuple` described above.
     """
 
-    return self._core_metadata
+    output = self._core_metadata
+    return output[0] if len(output) == 1 else output
 
   @property
   def dumped_tensor_data(self):
-    return self._dump_tensor_data
+    """Retrieve dumped tensor data."""
+    if len(self.devices()) == 1:
+      return self._dump_tensor_data[self.devices()[0]]
+    else:
+      all_devices_data = six.itervalues(self._dump_tensor_data)
+      data = []
+      for device_data in all_devices_data:
+        data.extend(device_data)
+      return sorted(data, key=lambda x: x.extended_timestamp)
 
   @property
   def t0(self):
-    """Absolute timestamp of the first dumped tensor.
+    """Absolute timestamp of the first dumped tensor across all devices.
 
     Returns:
       (`int`) absolute timestamp of the first dumped tensor, in microseconds.
     """
-
     return self._t0
 
   @property
@@ -756,10 +843,10 @@ class DebugDumpDir(object):
     """Total number of dumped tensors in the dump root directory.
 
     Returns:
-      (`int`) total number of dumped tensors in the dump root directory.
+      (`int`) The total number of dumped tensors in the dump root directory.
     """
-
-    return len(self._dump_tensor_data)
+    return sum(len(self._dump_tensor_data[device_name])
+               for device_name in self._dump_tensor_data)
 
   def _load_partition_graphs(self, partition_graphs, validate):
     """Load and process partition graphs.
@@ -770,56 +857,73 @@ class DebugDumpDir(object):
     tensor dumps.
 
     Args:
-      partition_graphs: Partition graphs executed by the TensorFlow runtime,
-        represented as repeated fields of GraphDef.
-        If no partition_graph is available, use None.
+      partition_graphs: A repeated field of GraphDefs representing the
+          partition graphs executed by the TensorFlow runtime.
       validate: (`bool`) Whether the dump files are to be validated against the
         partition graphs.
-    """
 
-    if partition_graphs:
-      self._partition_graphs = partition_graphs
-    elif self._dump_graph_file_paths:
-      # In case partition graphs are not available from arguments, load them
-      # from the dump directory.
-      self._partition_graphs = [
-          _load_graph_def_from_event_file(dump_file_path)
-          for dump_file_path in self._dump_graph_file_paths
-      ]
-    else:
-      self._partition_graphs = None
-      return
+    Raises:
+      ValueError: If the partition GraphDef of one or more devices fail to be
+        loaded.
+    """
 
     self._node_attributes = {}
-
     self._node_inputs = {}
     self._node_ctrl_inputs = {}
-
     self._node_recipients = {}
     self._node_ctrl_recipients = {}
-
-    self._devices = []
     self._node_devices = {}
     self._node_op_types = {}
+    self._copy_send_nodes = {}
+
+    self._partition_graphs = {}
+    for device_name in self._device_names:
+      partition_graph = None
+      if device_name in self._dump_graph_file_paths:
+        partition_graph = _load_graph_def_from_event_file(
+            self._dump_graph_file_paths[device_name])
+      else:
+        partition_graph = self._find_partition_graph(partition_graphs,
+                                                     device_name)
+
+      if partition_graph:
+        self._partition_graphs[device_name] = partition_graph
 
-    self._copy_send_nodes = []
+      self._node_attributes[device_name] = {}
+      self._node_inputs[device_name] = {}
+      self._node_ctrl_inputs[device_name] = {}
+      self._node_recipients[device_name] = {}
+      self._node_ctrl_recipients[device_name] = {}
+      self._node_op_types[device_name] = {}
+      self._copy_send_nodes[device_name] = []
 
-    for pg in self._partition_graphs:
-      for node in pg.node:
-        self._process_partition_graph_node(node)
+      if partition_graph:
+        for node in partition_graph.node:
+          self._process_partition_graph_node(device_name, node)
 
-    self._prune_non_control_edges_of_debug_ops()
-    self._prune_control_edges_of_debug_ops()
+      self._prune_non_control_edges_of_debug_ops(device_name)
+      self._prune_control_edges_of_debug_ops(device_name)
 
-    self._populate_recipient_maps()
+      self._populate_recipient_maps(device_name)
 
-    if validate:
-      self._validate_dump_with_graphs()
+      if device_name in self._partition_graphs and validate:
+        self._validate_dump_with_graphs(device_name)
+
+  def _find_partition_graph(self, partition_graphs, device_name):
+    if partition_graphs is None:
+      return None
+    else:
+      for graph_def in partition_graphs:
+        for node_def in graph_def.node:
+          if node_def.device == device_name:
+            return graph_def
+      return None
 
-  def _process_partition_graph_node(self, node):
+  def _process_partition_graph_node(self, device_name, node):
     """Process a node from the partition graphs.
 
     Args:
+      device_name: (str) device name.
       node: (NodeDef) A partition-graph node to be processed.
 
     Raises:
@@ -833,84 +937,91 @@ class DebugDumpDir(object):
       (watched_node_name, watched_output_slot, _,
        debug_op) = parse_debug_node_name(node.name)
 
-      self._debug_watches[watched_node_name][watched_output_slot].add(
-          debug_op)
+      self._debug_watches[device_name][watched_node_name][
+          watched_output_slot].add(debug_op)
 
       return
 
-    if node.name in self._node_inputs:
-      raise ValueError("Duplicate node name: '%s'" % node.name)
+    if node.name in self._node_inputs[device_name]:
+      raise ValueError("Duplicate node name on device %s: '%s'" %
+                       (device_name, node.name))
 
-    self._node_attributes[node.name] = node.attr
+    self._node_attributes[device_name][node.name] = node.attr
 
-    if node.device not in self._devices and node.device:
-      self._devices.append(node.device)
+    self._node_inputs[device_name][node.name] = []
+    self._node_ctrl_inputs[device_name][node.name] = []
+    self._node_recipients[device_name][node.name] = []
+    self._node_ctrl_recipients[device_name][node.name] = []
 
-    self._node_inputs[node.name] = []
-    self._node_ctrl_inputs[node.name] = []
-    self._node_recipients[node.name] = []
-    self._node_ctrl_recipients[node.name] = []
-
-    self._node_devices[node.name] = node.device
-    self._node_op_types[node.name] = node.op
+    if node.name not in self._node_devices:
+      self._node_devices[node.name] = set()
+    self._node_devices[node.name].add(node.device)
+    self._node_op_types[device_name][node.name] = node.op
 
     for inp in node.input:
       if is_copy_node(inp) and (node.op == "_Send" or node.op == "_Retval"):
-        self._copy_send_nodes.append(node.name)
+        self._copy_send_nodes[device_name].append(node.name)
 
       if inp.startswith("^"):
         cinp = inp[1:]
-        self._node_ctrl_inputs[node.name].append(cinp)
+        self._node_ctrl_inputs[device_name][node.name].append(cinp)
       else:
-        self._node_inputs[node.name].append(inp)
+        self._node_inputs[device_name][node.name].append(inp)
 
-  def _prune_nodes_from_input_and_recipient_maps(self, nodes_to_prune):
+  def _prune_nodes_from_input_and_recipient_maps(self,
+                                                 device_name,
+                                                 nodes_to_prune):
     """Prune nodes out of input and recipient maps.
 
     Args:
+      device_name: (`str`) device name.
       nodes_to_prune: (`list` of `str`) Names of the nodes to be pruned.
     """
 
     for node in nodes_to_prune:
-      del self._node_inputs[node]
-      del self._node_ctrl_inputs[node]
-      del self._node_recipients[node]
-      del self._node_ctrl_recipients[node]
+      del self._node_inputs[device_name][node]
+      del self._node_ctrl_inputs[device_name][node]
+      del self._node_recipients[device_name][node]
+      del self._node_ctrl_recipients[device_name][node]
 
-  def _prune_non_control_edges_of_debug_ops(self):
+  def _prune_non_control_edges_of_debug_ops(self, device_name):
     """Prune (non-control) edges related to debug ops.
 
     Prune the Copy ops and associated _Send ops inserted by the debugger out
     from the non-control inputs and output recipients map. Replace the inputs
     and recipients with original ones.
+
+    Args:
+      device_name: (`str`) device name.
     """
 
     copy_nodes = []
-    for node in self._node_inputs:
-      if node in self._copy_send_nodes:
+    for node in self._node_inputs[device_name]:
+      if node in self._copy_send_nodes[device_name]:
         continue
 
       if is_copy_node(node):
         copy_nodes.append(node)
 
-      inputs = self._node_inputs[node]
+      inputs = self._node_inputs[device_name][node]
 
       for i in xrange(len(inputs)):
         inp = inputs[i]
         if is_copy_node(inp):
           # Find the input to the Copy node, which should be the original
           # input to the node.
-          orig_inp = self._node_inputs[inp][0]
+          orig_inp = self._node_inputs[device_name][inp][0]
           inputs[i] = orig_inp
 
-    self._prune_nodes_from_input_and_recipient_maps(copy_nodes)
-    self._prune_nodes_from_input_and_recipient_maps(self._copy_send_nodes)
+    self._prune_nodes_from_input_and_recipient_maps(device_name, copy_nodes)
+    self._prune_nodes_from_input_and_recipient_maps(
+        device_name, self._copy_send_nodes[device_name])
 
-  def _prune_control_edges_of_debug_ops(self):
+  def _prune_control_edges_of_debug_ops(self, device_name):
     """Prune control edges related to the debug ops."""
 
-    for node in self._node_ctrl_inputs:
-      ctrl_inputs = self._node_ctrl_inputs[node]
+    for node in self._node_ctrl_inputs[device_name]:
+      ctrl_inputs = self._node_ctrl_inputs[device_name][node]
       debug_op_inputs = []
       for ctrl_inp in ctrl_inputs:
         if is_debug_node(ctrl_inp):
@@ -918,33 +1029,36 @@ class DebugDumpDir(object):
       for debug_op_inp in debug_op_inputs:
         ctrl_inputs.remove(debug_op_inp)
 
-  def _populate_recipient_maps(self):
+  def _populate_recipient_maps(self, device_name):
     """Populate the map from node name to recipient(s) of its output(s)."""
 
-    for node in self._node_inputs:
-      inputs = self._node_inputs[node]
+    for node in self._node_inputs[device_name]:
+      inputs = self._node_inputs[device_name][node]
       for inp in inputs:
         inp = get_node_name(inp)
-        if inp not in self._node_recipients:
-          self._node_recipients[inp] = []
-        self._node_recipients[inp].append(node)
+        if inp not in self._node_recipients[device_name]:
+          self._node_recipients[device_name][inp] = []
+        self._node_recipients[device_name][inp].append(node)
 
-    for node in self._node_ctrl_inputs:
-      ctrl_inputs = self._node_ctrl_inputs[node]
+    for node in self._node_ctrl_inputs[device_name]:
+      ctrl_inputs = self._node_ctrl_inputs[device_name][node]
       for ctrl_inp in ctrl_inputs:
-        if ctrl_inp in self._copy_send_nodes:
+        if ctrl_inp in self._copy_send_nodes[device_name]:
           continue
 
-        if ctrl_inp not in self._node_ctrl_recipients:
-          self._node_ctrl_recipients[ctrl_inp] = []
-        self._node_ctrl_recipients[ctrl_inp].append(node)
+        if ctrl_inp not in self._node_ctrl_recipients[device_name]:
+          self._node_ctrl_recipients[device_name][ctrl_inp] = []
+        self._node_ctrl_recipients[device_name][ctrl_inp].append(node)
 
-  def _validate_dump_with_graphs(self):
+  def _validate_dump_with_graphs(self, device_name):
     """Validate the dumped tensor data against the partition graphs.
 
     Only the watched nodes are validated by this method, because tfdbg allows
     clients to watch only a subset of the nodes.
 
+    Args:
+      device_name: (`str`) device name.
+
     Raises:
       LookupError: If the partition graphs have not been loaded yet.
       ValueError: If dumps contain node names not found in partition graph.
@@ -952,33 +1066,35 @@ class DebugDumpDir(object):
         input relations on the partition graphs.
     """
 
-    if not self._partition_graphs:
-      raise LookupError("No partition graphs loaded.")
+    if not self._partition_graphs[device_name]:
+      raise LookupError(
+          "No partition graphs loaded for device %s" % device_name)
 
     # Verify that the node names in the dump data are all present in the
     # partition graphs.
-    for datum in self._dump_tensor_data:
-      if datum.node_name not in self._node_inputs:
-        raise ValueError("Node name '%s' is not found in partition graphs." %
-                         datum.node_name)
+    for datum in self._dump_tensor_data[device_name]:
+      if datum.node_name not in self._node_inputs[device_name]:
+        raise ValueError("Node name '%s' is not found in partition graphs of "
+                         "device %s." % (datum.node_name, device_name))
 
     pending_inputs = {}
-    for node in self._node_inputs:
+    for node in self._node_inputs[device_name]:
       pending_inputs[node] = []
-      inputs = self._node_inputs[node]
+      inputs = self._node_inputs[device_name][node]
       for inp in inputs:
         inp_node = get_node_name(inp)
         inp_output_slot = get_output_slot(inp)
         # Inputs from Enter and NextIteration nodes are not validated because
         # DebugNodeInserter::InsertNodes() in the debugger core skips creating
         # control edges from debug ops watching these types of nodes.
-        if (inp_node in self._debug_watches and
-            inp_output_slot in self._debug_watches[inp_node] and
-            self._node_op_types.get(inp) not in ("Enter", "NextIteration") and
+        if (inp_node in self._debug_watches[device_name] and
+            inp_output_slot in self._debug_watches[device_name][inp_node] and
+            self._node_op_types[device_name].get(inp) not in (
+                "Enter", "NextIteration") and
             (inp_node, inp_output_slot) not in pending_inputs[node]):
           pending_inputs[node].append((inp_node, inp_output_slot))
 
-    for i, datum in enumerate(self._dump_tensor_data):
+    for i, datum in enumerate(self._dump_tensor_data[device_name]):
       node = datum.node_name
       slot = datum.output_slot
       # In some cases (e.g., system clocks with insufficient precision),
@@ -986,13 +1102,13 @@ class DebugDumpDir(object):
       # following check examines this possibility and avoids raising an error if
       # that is the case.
       if not self._satisfied_at_timestamp(
-          pending_inputs[node], datum.timestamp, start_i=i + 1):
+          device_name, pending_inputs[node], datum.timestamp, start_i=i + 1):
         raise ValueError("Causality violated in timing relations of debug "
                          "dumps: %s (%d): "
                          "these input(s) are not satisfied: %s" %
                          (node, datum.timestamp, repr(pending_inputs[node])))
 
-      recipients = self._node_recipients[node]
+      recipients = self._node_recipients[device_name][node]
       for recipient in recipients:
         recipient_pending_inputs = pending_inputs[recipient]
         if (node, slot) in recipient_pending_inputs:
@@ -1004,12 +1120,13 @@ class DebugDumpDir(object):
             del recipient_pending_inputs[
                 recipient_pending_inputs.index((node, slot))]
 
-  def _satisfied_at_timestamp(self, pending, timestamp, start_i=0):
+  def _satisfied_at_timestamp(self, device_name, pending, timestamp, start_i=0):
     """Determine whether pending inputs are satisfied at given timestamp.
 
     Note: This method mutates the input argument "pending".
 
     Args:
+      device_name: (str) device name.
       pending: A list of 2-tuple (node_name, output_slot): the dependencies to
         check.
       timestamp: (int) the timestamp in question.
@@ -1023,7 +1140,7 @@ class DebugDumpDir(object):
     if not pending:
       return True
 
-    for datum in self._dump_tensor_data[start_i:]:
+    for datum in self._dump_tensor_data[device_name][start_i:]:
       if datum.timestamp > timestamp:
         break
       if (datum.timestamp == timestamp and
@@ -1042,7 +1159,7 @@ class DebugDumpDir(object):
     """Get the partition graphs.
 
     Returns:
-      Partition graphs as repeated fields of GraphDef.
+      Partition graphs as a list of GraphDef.
 
     Raises:
       LookupError: If no partition graphs have been loaded.
@@ -1051,50 +1168,90 @@ class DebugDumpDir(object):
     if self._partition_graphs is None:
       raise LookupError("No partition graphs have been loaded.")
 
-    return self._partition_graphs
+    return self._partition_graphs.values()
 
   @property
   def run_fetches_info(self):
     """Get a str representation of the fetches used in the Session.run() call.
 
     Returns:
-      If the information is available, a `str` obtained from `repr(fetches)`.
+      If the information is available from one `Session.run` call, a `str`
+        obtained from `repr(fetches)`.
+      If the information is available from multiple `Session.run` calls, a
+        `list` of `str` from `repr(fetches)`.
       If the information is not available, `None`.
     """
 
-    return self._run_fetches_info
+    output = self._run_fetches_info
+    return output[0] if len(output) == 1 else output
 
   @property
   def run_feed_keys_info(self):
     """Get a str representation of the feed_dict used in the Session.run() call.
 
     Returns:
-      If the information is available, a `str` obtained from `repr(feed_dict)`.
+      If the information is available from one `Session.run` call, a `str`
+        obtained from `repr(feed_dict)`.
+      If the information is available from multiple `Session.run` calls, a
+        `list` of `str` obtained from `repr(feed_dict)`.
       If the information is not available, `None`.
     """
 
-    return self._run_feed_keys_info
+    output = self._run_feed_keys_info
+    return output[0] if len(output) == 1 else output
+
+  def _infer_device_name(self, device_name, node_name):
+    if device_name is None:
+      if len(self.devices()) == 1:
+        return self.devices()[0]
+      else:
+        if node_name in self._node_devices:
+          if len(self._node_devices[node_name]) == 1:
+            return list(self._node_devices[node_name])[0]
+          else:
+            raise ValueError(
+                "There are multiple (%d) devices with nodes named '%s' but "
+                "device_name is not specified." %
+                (len(self._node_devices[node_name]), node_name))
+        else:
+          raise ValueError("None of the %d devices has a node named '%s'." %
+                           (len(self._device_names), node_name))
+    else:
+      return device_name
 
-  def nodes(self):
+  def nodes(self, device_name=None):
     """Get a list of all nodes from the partition graphs.
 
+    Args:
+      device_name: (`str`) name of device. If there is only one device, this
+        argumnet is optional.
+
     Returns:
       All nodes' names, as a list of str.
 
     Raises:
       LookupError: If no partition graphs have been loaded.
+      ValueError: If there are multiple devices, but device_name is not
+        specified.
     """
-
     if self._partition_graphs is None:
       raise LookupError("No partition graphs have been loaded.")
+    if device_name is None:
+      if len(self.devices()) == 1:
+        device_name = self.devices()[0]
+      else:
+        raise ValueError(
+            "There are multiple (%d) devices, but "
+            "device_name is not specified." % len(self.devices()))
+    return [node_name for node_name in self._node_inputs[device_name]]
 
-    return [node_name for node_name in self._node_inputs]
-
-  def node_attributes(self, node_name):
+  def node_attributes(self, node_name, device_name=None):
     """Get the attributes of a node.
 
     Args:
       node_name: Name of the node in question.
+      device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argumnet is optional.
 
     Returns:
       Attributes of the node.
@@ -1103,22 +1260,25 @@ class DebugDumpDir(object):
       LookupError: If no partition graphs have been loaded.
       ValueError: If no node named node_name exists.
     """
-
     if self._partition_graphs is None:
       raise LookupError("No partition graphs have been loaded.")
 
-    if node_name in self._node_attributes:
-      return self._node_attributes[node_name]
+    device_name = self._infer_device_name(device_name, node_name)
+    if node_name in self._node_attributes[device_name]:
+      return self._node_attributes[device_name][node_name]
     else:
-      raise ValueError("No node named \"%s\" exists." % node_name)
+      raise ValueError("No node named \"%s\" exists on device %s." % (
+          node_name, device_name))
 
-  def node_inputs(self, node_name, is_control=False):
+  def node_inputs(self, node_name, is_control=False, device_name=None):
     """Get the inputs of given node according to partition graphs.
 
     Args:
       node_name: Name of the node.
       is_control: (`bool`) Whether control inputs, rather than non-control
         inputs, are to be returned.
+      device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argumnet is optional.
 
     Returns:
       (`list` of `str`) inputs to the node, as a list of node names.
@@ -1133,21 +1293,27 @@ class DebugDumpDir(object):
       raise LookupError(
           "Node inputs are not loaded from partition graphs yet.")
 
-    if node_name not in self._node_inputs:
-      raise ValueError("Node '%s' does not exist in partition graphs." %
-                       node_name)
+    device_name = self._infer_device_name(device_name, node_name)
+    if node_name not in self._node_inputs[device_name]:
+      raise ValueError("Node '%s' does not exist in the partition graph of "
+                       "device %s." % (node_name, device_name))
 
     if is_control:
-      return self._node_ctrl_inputs[node_name]
+      return self._node_ctrl_inputs[device_name][node_name]
     else:
-      return self._node_inputs[node_name]
+      return self._node_inputs[device_name][node_name]
 
-  def transitive_inputs(self, node_name, include_control=True):
+  def transitive_inputs(self,
+                        node_name,
+                        include_control=True,
+                        device_name=None):
     """Get the transitive inputs of given node according to partition graphs.
 
     Args:
-      node_name: Name of the node
+      node_name: Name of the node.
       include_control: Include control inputs (True by default).
+      device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argumnet is optional.
 
     Returns:
       (`list` of `str`) all transitive inputs to the node, as a list of node
@@ -1163,9 +1329,11 @@ class DebugDumpDir(object):
       raise LookupError(
           "Node inputs are not loaded from partition graphs yet.")
 
-    if node_name not in self._node_inputs:
-      raise ValueError("Node '%s' does not exist in partition graphs." %
-                       node_name)
+    device_name = self._infer_device_name(device_name, node_name)
+    if node_name not in self._node_inputs[device_name]:
+      raise ValueError(
+          "Node '%s' does not exist in the partition graph of device %s." %
+          (node_name, device_name))
 
     inputs = []
 
@@ -1186,21 +1354,21 @@ class DebugDumpDir(object):
 
       # Stop the tracing at a Merge op, as it is generally impossible to infer
       # outside the runtime which input to the Merge op is alive.
-      if self._node_op_types[node] == "Merge":
+      if self._node_op_types[device_name][node] == "Merge":
         return
 
       if node in visited_nodes:
         return
       visited_nodes.append(node)
 
-      for inp in self._node_inputs[node]:
+      for inp in self._node_inputs[device_name][node]:
         if inp == node_name:
           continue
         inputs.append(inp)
         trace_inputs(inp)
 
       if include_control:
-        for ctrl_inp in self._node_ctrl_inputs[node]:
+        for ctrl_inp in self._node_ctrl_inputs[device_name][node]:
           if ctrl_inp == node_name:
             continue
           inputs.append(ctrl_inp)
@@ -1210,13 +1378,15 @@ class DebugDumpDir(object):
 
     return inputs
 
-  def node_recipients(self, node_name, is_control=False):
+  def node_recipients(self, node_name, is_control=False, device_name=None):
     """Get recipient of the given node's output according to partition graphs.
 
     Args:
       node_name: (`str`) name of the node.
       is_control: (`bool`) whether control outputs, rather than non-control
         outputs, are to be returned.
+      device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argumnet is optional.
 
     Returns:
       (`list` of `str`) all inputs to the node, as a list of node names.
@@ -1231,58 +1401,67 @@ class DebugDumpDir(object):
       raise LookupError(
           "Node recipients are not loaded from partition graphs yet.")
 
-    if node_name not in self._node_recipients:
-      raise ValueError("Node '%s' does not exist in partition graphs." %
-                       node_name)
+    device_name = self._infer_device_name(device_name, node_name)
+    if node_name not in self._node_recipients[device_name]:
+      raise ValueError(
+          "Node '%s' does not exist in the partition graph of device %s." %
+          (node_name, device_name))
 
     if is_control:
-      return self._node_ctrl_recipients[node_name]
+      return self._node_ctrl_recipients[device_name][node_name]
     else:
-      return self._node_recipients[node_name]
+      return self._node_recipients[device_name][node_name]
 
   def devices(self):
-    """Get the list of devices.
+    """Get the list of device names.
 
     Returns:
       (`list` of `str`) names of the devices.
-
-    Raises:
-      LookupError: If node inputs and control inputs have not been loaded
-         from partition graphs yet.
     """
 
-    if self._partition_graphs is None:
-      raise LookupError("Devices are not loaded from partition graphs yet.")
-
-    return self._devices
+    return self._device_names
 
-  def node_exists(self, node_name):
+  def node_exists(self, node_name, device_name=None):
     """Test if a node exists in the partition graphs.
 
     Args:
       node_name: (`str`) name of the node to be checked.
+      device_name: optional device name. If None, will search for the node
+        on all available devices. Otherwise, search for the node only on
+        the given device.
 
     Returns:
       A boolean indicating whether the node exists.
 
     Raises:
       LookupError: If no partition graphs have been loaded yet.
+      ValueError: If device_name is specified but cannot be found.
     """
 
     if self._node_inputs is None:
       raise LookupError(
           "Nodes have not been loaded from partition graphs yet.")
 
-    return node_name in self._node_inputs
+    if (device_name is not None) and device_name not in self._node_inputs:
+      raise ValueError(
+          "The specified device_name '%s' cannot be found." % device_name)
+
+    node_inputs_all_devices = (self._node_inputs if device_name is None
+                               else (self._node_inputs[device_name],))
+
+    return any(node_name in node_inputs_all_devices[dev_name]
+               for dev_name in node_inputs_all_devices)
 
   def node_device(self, node_name):
-    """Get the device of a node.
+    """Get the names of the devices that has nodes of the specified name.
 
     Args:
       node_name: (`str`) name of the node.
 
     Returns:
-      (`str`) name of the device on which the node is placed.
+      (`str` or `list` of `str`) name of the device(s) on which the node of the
+        given name is found. Returns a `str` if there is only one such device,
+        otherwise return a `list` of `str`.
 
     Raises:
       LookupError: If node inputs and control inputs have not been loaded
@@ -1298,13 +1477,16 @@ class DebugDumpDir(object):
       raise ValueError("Node '%s' does not exist in partition graphs." %
                        node_name)
 
-    return self._node_devices[node_name]
+    output = list(self._node_devices[node_name])
+    return output[0] if len(output) == 1 else output
 
-  def node_op_type(self, node_name):
+  def node_op_type(self, node_name, device_name=None):
     """Get the op type of given node.
 
     Args:
       node_name: (`str`) name of the node.
+      device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argumnet is optional.
 
     Returns:
       (`str`) op type of the node.
@@ -1319,17 +1501,21 @@ class DebugDumpDir(object):
       raise LookupError(
           "Node op types are not loaded from partition graphs yet.")
 
-    if node_name not in self._node_op_types:
-      raise ValueError("Node '%s' does not exist in partition graphs." %
-                       node_name)
+    device_name = self._infer_device_name(device_name, node_name)
+    if node_name not in self._node_op_types[device_name]:
+      raise ValueError(
+          "Node '%s' does not exist in the partition graph of device '%s'. " %
+          (node_name, device_name))
 
-    return self._node_op_types[node_name]
+    return self._node_op_types[device_name][node_name]
 
-  def debug_watch_keys(self, node_name):
+  def debug_watch_keys(self, node_name, device_name=None):
     """Get all tensor watch keys of given node according to partition graphs.
 
     Args:
       node_name: (`str`) name of the node.
+      device_name: (`str`) name of the device. If there is only one device or if
+        node_name exists on only one device, this argumnet is optional.
 
     Returns:
       (`list` of `str`) all debug tensor watch keys. Returns an empty list if
@@ -1340,35 +1526,61 @@ class DebugDumpDir(object):
         partition graphs yet.
     """
 
-    if node_name not in self._debug_watches:
+    try:
+      device_name = self._infer_device_name(device_name, node_name)
+    except ValueError:
+      return []
+
+    if node_name not in self._debug_watches[device_name]:
       return []
 
     watch_keys = []
-    for watched_slot in self._debug_watches[node_name]:
-      debug_ops = self._debug_watches[node_name][watched_slot]
+    for watched_slot in self._debug_watches[device_name][node_name]:
+      debug_ops = self._debug_watches[device_name][node_name][watched_slot]
       for debug_op in debug_ops:
         watch_keys.append(
             _get_tensor_watch_key(node_name, watched_slot, debug_op))
 
     return watch_keys
 
-  def watch_key_to_data(self, debug_watch_key):
+  def watch_key_to_data(self, debug_watch_key, device_name=None):
     """Get all `DebugTensorDatum` instances corresponding to a debug watch key.
 
     Args:
       debug_watch_key: (`str`) debug watch key.
+      device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argumnet
+        is optional.
 
     Returns:
       A list of `DebugTensorDatum` instances that correspond to the debug watch
       key. If the watch key does not exist, returns an empty list.
 
     Raises:
-      ValueError: If the debug watch key does not exist.
+      ValueError: If there are multiple devices that have the debug_watch_key,
+        but device_name is not specified.
     """
+    if device_name is None:
+      matching_device_names = [
+          name for name in self._watch_key_to_datum
+          if debug_watch_key in self._watch_key_to_datum[name]]
+      if not matching_device_names:
+        return []
+      elif len(matching_device_names) == 1:
+        device_name = matching_device_names[0]
+      else:
+        raise ValueError(
+            "The debug watch key '%s' exists on multiple (%d) devices, but "
+            "device name is not specified." %
+            (debug_watch_key, len(matching_device_names)))
+    elif device_name not in self._debug_key_to_datum:
+      raise ValueError(
+          "There is no device named '%s' consisting of debug watch keys." %
+          device_name)
 
-    return self._watch_key_to_datum.get(debug_watch_key, [])
+    return self._watch_key_to_datum[device_name].get(debug_watch_key, [])
 
-  def find(self, predicate, first_n=0):
+  def find(self, predicate, first_n=0, device_name=None):
     """Find dumped tensor data by a certain predicate.
 
     Args:
@@ -1386,6 +1598,7 @@ class DebugDumpDir(object):
       first_n: (`int`) return only the first n `DebugTensotDatum` instances (in
         time order) for which the predicate returns True. To return all the
         `DebugTensotDatum` instances, let first_n be <= 0.
+      device_name: optional device name.
 
     Returns:
       A list of all `DebugTensorDatum` objects in this `DebugDumpDir` object
@@ -1394,22 +1607,31 @@ class DebugDumpDir(object):
     """
 
     matched_data = []
-    for datum in self._dump_tensor_data:
-      if predicate(datum, datum.get_tensor()):
-        matched_data.append(datum)
+    for device in (self._dump_tensor_data if device_name is None
+                   else (self._dump_tensor_data[device_name],)):
+      for datum in self._dump_tensor_data[device]:
+        if predicate(datum, datum.get_tensor()):
+          matched_data.append(datum)
 
-        if first_n > 0 and len(matched_data) >= first_n:
-          break
+          if first_n > 0 and len(matched_data) >= first_n:
+            return matched_data
 
     return matched_data
 
-  def get_tensor_file_paths(self, node_name, output_slot, debug_op):
+  def get_tensor_file_paths(self,
+                            node_name,
+                            output_slot,
+                            debug_op,
+                            device_name=None):
     """Get the file paths from a debug-dumped tensor.
 
     Args:
       node_name: (`str`) name of the node that the tensor is produced by.
       output_slot: (`int`) output slot index of tensor.
       debug_op: (`str`) name of the debug op.
+      device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argumnet
+        is optional.
 
     Returns:
       List of file path(s) loaded. This is a list because each debugged tensor
@@ -1420,14 +1642,17 @@ class DebugDumpDir(object):
         the debug-dump data.
     """
 
+    device_name = self._infer_device_name(device_name, node_name)
     watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
-    if watch_key not in self._watch_key_to_datum:
+    if watch_key not in self._watch_key_to_datum[device_name]:
       raise WatchKeyDoesNotExistInDebugDumpDirError(
-          "Watch key \"%s\" does not exist in the debug dump" % watch_key)
+          "Watch key \"%s\" does not exist in the debug dump of device %s" %
+          (watch_key, device_name))
 
-    return [datum.file_path for datum in self._watch_key_to_datum[watch_key]]
+    return [datum.file_path for datum in
+            self._watch_key_to_datum[device_name][watch_key]]
 
-  def get_tensors(self, node_name, output_slot, debug_op):
+  def get_tensors(self, node_name, output_slot, debug_op, device_name=None):
     """Get the tensor value from for a debug-dumped tensor.
 
     The tensor may be dumped multiple times in the dump root directory, so a
@@ -1437,6 +1662,9 @@ class DebugDumpDir(object):
       node_name: (`str`) name of the node that the tensor is produced by.
       output_slot: (`int`) output slot index of tensor.
       debug_op: (`str`) name of the debug op.
+      device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argumnet
+        is optional.
 
     Returns:
       List of tensors (`numpy.ndarray`) loaded from the debug-dump file(s).
@@ -1447,13 +1675,20 @@ class DebugDumpDir(object):
     """
 
     watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
-    if watch_key not in self._watch_key_to_datum:
+    try:
+      device_name = self._infer_device_name(device_name, node_name)
+      return [datum.get_tensor() for datum in
+              self._watch_key_to_datum[device_name][watch_key]]
+    except (ValueError, KeyError):
       raise WatchKeyDoesNotExistInDebugDumpDirError(
-          "Watch key \"%s\" does not exist in the debug dump" % watch_key)
-
-    return [datum.get_tensor() for datum in self._watch_key_to_datum[watch_key]]
-
-  def get_rel_timestamps(self, node_name, output_slot, debug_op):
+          "Watch key \"%s\" does not exist in the debug dump of device %s" %
+          (watch_key, device_name))
+
+  def get_rel_timestamps(self,
+                         node_name,
+                         output_slot,
+                         debug_op,
+                         device_name=None):
     """Get the relative timestamp from for a debug-dumped tensor.
 
     Relative timestamp means (absolute timestamp - `t0`), where `t0` is the
@@ -1465,6 +1700,9 @@ class DebugDumpDir(object):
       node_name: (`str`) name of the node that the tensor is produced by.
       output_slot: (`int`) output slot index of tensor.
       debug_op: (`str`) name of the debug op.
+      device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argumnet
+        is optional.
 
     Returns:
       (`list` of `int`) list of relative timestamps.
@@ -1474,14 +1712,20 @@ class DebugDumpDir(object):
         exist in the debug dump data.
     """
 
+    device_name = self._infer_device_name(device_name, node_name)
     watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
-    if watch_key not in self._watch_key_to_datum:
+    if watch_key not in self._watch_key_to_datum[device_name]:
       raise WatchKeyDoesNotExistInDebugDumpDirError(
           "Watch key \"%s\" does not exist in the debug dump" % watch_key)
 
-    return self._watch_key_to_rel_time[watch_key]
+    # TODO(cais): Figure out whether this should be relative to the global t0.
+    return self._watch_key_to_rel_time[device_name][watch_key]
 
-  def get_dump_sizes_bytes(self, node_name, output_slot, debug_op):
+  def get_dump_sizes_bytes(self,
+                           node_name,
+                           output_slot,
+                           debug_op,
+                           device_name=None):
     """Get the sizes of the dump files for a debug-dumped tensor.
 
     Unit of the file size: byte.
@@ -1490,6 +1734,9 @@ class DebugDumpDir(object):
       node_name: (`str`) name of the node that the tensor is produced by.
       output_slot: (`int`) output slot index of tensor.
       debug_op: (`str`) name of the debug op.
+      device_name: (`str`) name of the device. If there is only one device or if
+        the specified debug_watch_key exists on only one device, this argumnet
+        is optional.
 
     Returns:
       (`list` of `int`): list of dump file sizes in bytes.
@@ -1499,12 +1746,14 @@ class DebugDumpDir(object):
         exist in the debug dump data.
     """
 
+    device_name = self._infer_device_name(device_name, node_name)
     watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
-    if watch_key not in self._watch_key_to_datum:
+    if watch_key not in self._watch_key_to_datum[device_name]:
       raise WatchKeyDoesNotExistInDebugDumpDirError(
-          "Watch key \"%s\" does not exist in the debug dump" % watch_key)
+          "Watch key \"%s\" does not exist in the debug dump of device %s" %
+          (watch_key, device_name))
 
-    return self._watch_key_to_dump_size_bytes[watch_key]
+    return self._watch_key_to_dump_size_bytes[device_name][watch_key]
 
   def node_traceback(self, element_name):
     """Try to retrieve the Python traceback of node's construction.
diff --git a/tensorflow/python/debug/lib/debug_data_test.py b/tensorflow/python/debug/lib/debug_data_test.py
index dc45e8df6c..dd621a5afc 100644
--- a/tensorflow/python/debug/lib/debug_data_test.py
+++ b/tensorflow/python/debug/lib/debug_data_test.py
@@ -23,12 +23,29 @@ import tempfile
 
 import numpy as np
 
+from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
 
 
+class DeviceNamePathConversionTest(test_util.TensorFlowTestCase):
+
+  def testDeviceNameToDevicePath(self):
+    self.assertEqual(
+        debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+        ",job_ps,replica_1,task_2,cpu_0",
+        debug_data.device_name_to_device_path("/job:ps/replica:1/task:2/cpu:0"))
+
+  def testDevicePathToDeviceName(self):
+    self.assertEqual(
+        "/job:ps/replica:1/task:2/cpu:0",
+        debug_data.device_path_to_device_name(
+            debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+            ",job_ps,replica_1,task_2,cpu_0"))
+
+
 class ParseNodeOrTensorNameTest(test_util.TensorFlowTestCase):
 
   def testParseNodeName(self):
@@ -163,7 +180,10 @@ class DebugTensorDatumTest(test_util.TensorFlowTestCase):
 
   def testDebugDatum(self):
     dump_root = "/tmp/tfdbg_1"
-    debug_dump_rel_path = "ns1/ns2/node_a_1_2_DebugIdentity_1472563253536385"
+    debug_dump_rel_path = (
+        debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+        ",job_localhost,replica_0,task_0,cpu_0" +
+        "/ns1/ns2/node_a_1_2_DebugIdentity_1472563253536385")
 
     datum = debug_data.DebugTensorDatum(dump_root, debug_dump_rel_path)
 
@@ -175,16 +195,18 @@ class DebugTensorDatumTest(test_util.TensorFlowTestCase):
     self.assertEqual("ns1/ns2/node_a_1:2:DebugIdentity", datum.watch_key)
     self.assertEqual(
         os.path.join(dump_root, debug_dump_rel_path), datum.file_path)
-    self.assertEqual("{DebugTensorDatum: %s:%d @ %s @ %d}" % (datum.node_name,
-                                                              datum.output_slot,
-                                                              datum.debug_op,
-                                                              datum.timestamp),
-                     str(datum))
-    self.assertEqual("{DebugTensorDatum: %s:%d @ %s @ %d}" % (datum.node_name,
-                                                              datum.output_slot,
-                                                              datum.debug_op,
-                                                              datum.timestamp),
-                     repr(datum))
+    self.assertEqual(
+        "{DebugTensorDatum (/job:localhost/replica:0/task:0/cpu:0) "
+        "%s:%d @ %s @ %d}" % (datum.node_name,
+                              datum.output_slot,
+                              datum.debug_op,
+                              datum.timestamp), str(datum))
+    self.assertEqual(
+        "{DebugTensorDatum (/job:localhost/replica:0/task:0/cpu:0) "
+        "%s:%d @ %s @ %d}" % (datum.node_name,
+                              datum.output_slot,
+                              datum.debug_op,
+                              datum.timestamp), repr(datum))
 
   def testDumpSizeBytesIsNoneForNonexistentFilePath(self):
     dump_root = "/tmp/tfdbg_1"
@@ -204,18 +226,112 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
     # Tear down temporary dump directory.
     shutil.rmtree(self._dump_root)
 
+  def _makeDataDirWithMultipleDevicesAndDuplicateNodeNames(self):
+    cpu_0_dir = os.path.join(
+        self._dump_root,
+        debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+        ",job_localhost,replica_0,task_0,cpu_0")
+    gpu_0_dir = os.path.join(
+        self._dump_root,
+        debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+        ",job_localhost,replica_0,task_0,gpu_0")
+    gpu_1_dir = os.path.join(
+        self._dump_root,
+        debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+        ",job_localhost,replica_0,task_0,gpu_1")
+    os.makedirs(cpu_0_dir)
+    os.makedirs(gpu_0_dir)
+    os.makedirs(gpu_1_dir)
+    open(os.path.join(
+        cpu_0_dir, "node_foo_1_2_DebugIdentity_1472563253536386"), "wb")
+    open(os.path.join(
+        gpu_0_dir, "node_foo_1_2_DebugIdentity_1472563253536385"), "wb")
+    open(os.path.join(
+        gpu_1_dir, "node_foo_1_2_DebugIdentity_1472563253536387"), "wb")
+
   def testDebugDumpDir_nonexistentDumpRoot(self):
     with self.assertRaisesRegexp(IOError, "does not exist"):
       debug_data.DebugDumpDir(tempfile.mktemp() + "_foo")
 
   def testDebugDumpDir_invalidFileNamingPattern(self):
     # File name with too few underscores should lead to an exception.
-    open(os.path.join(self._dump_root, "node1_DebugIdentity_1234"), "wb")
+    device_dir = os.path.join(
+        self._dump_root,
+        debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+        ",job_localhost,replica_0,task_0,cpu_0")
+    os.makedirs(device_dir)
+    open(os.path.join(device_dir, "node1_DebugIdentity_1234"), "wb")
 
     with self.assertRaisesRegexp(ValueError,
                                  "does not conform to the naming pattern"):
       debug_data.DebugDumpDir(self._dump_root)
 
+  def testDebugDumpDir_validDuplicateNodeNamesWithMultipleDevices(self):
+    self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames()
+
+    graph_cpu_0 = graph_pb2.GraphDef()
+    node = graph_cpu_0.node.add()
+    node.name = "node_foo_1"
+    node.op = "FooOp"
+    node.device = "/job:localhost/replica:0/task:0/cpu:0"
+    graph_gpu_0 = graph_pb2.GraphDef()
+    node = graph_gpu_0.node.add()
+    node.name = "node_foo_1"
+    node.op = "FooOp"
+    node.device = "/job:localhost/replica:0/task:0/gpu:0"
+    graph_gpu_1 = graph_pb2.GraphDef()
+    node = graph_gpu_1.node.add()
+    node.name = "node_foo_1"
+    node.op = "FooOp"
+    node.device = "/job:localhost/replica:0/task:0/gpu:1"
+
+    dump_dir = debug_data.DebugDumpDir(
+        self._dump_root,
+        partition_graphs=[graph_cpu_0, graph_gpu_0, graph_gpu_1])
+
+    self.assertItemsEqual(
+        ["/job:localhost/replica:0/task:0/cpu:0",
+         "/job:localhost/replica:0/task:0/gpu:0",
+         "/job:localhost/replica:0/task:0/gpu:1"], dump_dir.devices())
+    self.assertEqual(1472563253536385, dump_dir.t0)
+    self.assertEqual(3, dump_dir.size)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"There are multiple \(3\) devices, but device_name is not specified"):
+      dump_dir.nodes()
+    self.assertItemsEqual(
+        ["node_foo_1"],
+        dump_dir.nodes(device_name="/job:localhost/replica:0/task:0/cpu:0"))
+
+  def testDuplicateNodeNamesInGraphDefOfSingleDeviceRaisesException(self):
+    self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames()
+    graph_cpu_0 = graph_pb2.GraphDef()
+    node = graph_cpu_0.node.add()
+    node.name = "node_foo_1"
+    node.op = "FooOp"
+    node.device = "/job:localhost/replica:0/task:0/cpu:0"
+    graph_gpu_0 = graph_pb2.GraphDef()
+    node = graph_gpu_0.node.add()
+    node.name = "node_foo_1"
+    node.op = "FooOp"
+    node.device = "/job:localhost/replica:0/task:0/gpu:0"
+    graph_gpu_1 = graph_pb2.GraphDef()
+    node = graph_gpu_1.node.add()
+    node.name = "node_foo_1"
+    node.op = "FooOp"
+    node.device = "/job:localhost/replica:0/task:0/gpu:1"
+    node = graph_gpu_1.node.add()  # Here is the duplicate.
+    node.name = "node_foo_1"
+    node.op = "FooOp"
+    node.device = "/job:localhost/replica:0/task:0/gpu:1"
+
+    with self.assertRaisesRegexp(
+        ValueError, r"Duplicate node name on device "):
+      debug_data.DebugDumpDir(
+          self._dump_root,
+          partition_graphs=[graph_cpu_0, graph_gpu_0, graph_gpu_1])
+
   def testDebugDumpDir_emptyDumpDir(self):
     dump_dir = debug_data.DebugDumpDir(self._dump_root)
 
diff --git a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
new file mode 100644
index 0000000000..b0dc25851c
--- /dev/null
+++ b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
@@ -0,0 +1,93 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for debugger functionalities under multiple (i.e., >1) GPUs."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import device_lib
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import debug_data
+from tensorflow.python.debug.lib import debug_utils
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class SessionDebugMultiGPUTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._dump_root = tempfile.mkdtemp()
+
+  def tearDown(self):
+    ops.reset_default_graph()
+
+    # Tear down temporary dump directory.
+    if os.path.isdir(self._dump_root):
+      shutil.rmtree(self._dump_root)
+
+  def testMultiGPUSessionRun(self):
+    local_devices = device_lib.list_local_devices()
+    gpu_device_names = []
+    for device in local_devices:
+      if device.device_type == "GPU":
+        gpu_device_names.append(device.name)
+    gpu_device_names = sorted(gpu_device_names)
+
+    if len(gpu_device_names) < 2:
+      self.skipTest(
+          "This test requires at least 2 GPUs, but only %d is available." %
+          len(gpu_device_names))
+
+    with session.Session() as sess:
+      v = variables.Variable([10.0, 15.0], dtype=dtypes.float32, name="v")
+      with ops.device(gpu_device_names[0]):
+        u0 = math_ops.add(v, v, name="u0")
+      with ops.device(gpu_device_names[1]):
+        u1 = math_ops.multiply(v, v, name="u1")
+      w = math_ops.subtract(u1, u0, name="w")
+
+      sess.run(v.initializer)
+
+      run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      debug_utils.watch_graph(run_options, sess.graph,
+                              debug_urls="file://" + self._dump_root)
+      run_metadata = config_pb2.RunMetadata()
+      self.assertAllClose(
+          [80.0, 195.0],
+          sess.run(w, options=run_options, run_metadata=run_metadata))
+
+      debug_dump_dir = debug_data.DebugDumpDir(
+          self._dump_root, partition_graphs=run_metadata.partition_graphs)
+      self.assertEqual(3, len(debug_dump_dir.devices()))
+      self.assertAllClose(
+          [10.0, 15.0], debug_dump_dir.get_tensors("v", 0, "DebugIdentity")[0])
+      self.assertAllClose(
+          [20.0, 30.0], debug_dump_dir.get_tensors("u0", 0, "DebugIdentity")[0])
+      self.assertAllClose(
+          [100.0, 225.0],
+          debug_dump_dir.get_tensors("u1", 0, "DebugIdentity")[0])
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 1b4444ef5a..a14ff1af3c 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -250,8 +250,12 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
     self.assertIn(results.v.op.type, results.dump.node_op_type(results.v_name))
     self.assertIn(results.w.op.type, results.dump.node_op_type(results.w_name))
 
-    with self.assertRaisesRegexp(
-        ValueError, "Node 'foo_bar' does not exist in partition graphs."):
+    if test_util.gpu_device_name():
+      expected_error_regexp = r"None of the .* devices has a node named "
+    else:
+      expected_error_regexp = (
+          r"Node \'foo_bar\' does not exist in the partition graph of device")
+    with self.assertRaisesRegexp(ValueError, expected_error_regexp):
       results.dump.node_op_type("foo_bar")
 
   def testDumpStringTensorsWorks(self):
@@ -437,9 +441,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       # Verify dump files
       self.assertTrue(os.path.isdir(self._dump_root))
 
-      self.assertTrue(os.path.isdir(os.path.join(self._dump_root, u_namespace)))
-      self.assertTrue(
-          os.path.isdir(os.path.join(self._dump_root, v_namespace, "v")))
+      u_glob_out = glob.glob(os.path.join(self._dump_root, "*", u_namespace))
+      v_glob_out = glob.glob(os.path.join(
+          self._dump_root, "*", v_namespace, "v"))
+      self.assertTrue(os.path.isdir(u_glob_out[0]))
+      self.assertTrue(os.path.isdir(v_glob_out[0]))
 
       dump = debug_data.DebugDumpDir(
           self._dump_root, partition_graphs=run_metadata.partition_graphs)
@@ -689,7 +695,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
     u_read_name = u_name + "/read"
 
     # Test node name list lookup of the DebugDumpDir object.
-    node_names = dump.nodes()
+    if test_util.gpu_device_name():
+      node_names = dump.nodes(
+          device_name="/job:localhost/replica:0/task:0/gpu:0")
+    else:
+      node_names = dump.nodes()
     self.assertTrue(u_name in node_names)
     self.assertTrue(u_read_name in node_names)
 
@@ -699,7 +709,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
     self.assertEqual(1, len(u_attr["shape"].shape.dim))
     self.assertEqual(2, u_attr["shape"].shape.dim[0].size)
 
-    with self.assertRaisesRegexp(ValueError, "No node named \"foo\" exists"):
+    if test_util.gpu_device_name():
+      expected_error_regexp = r"None of the .* devices has a node named "
+    else:
+      expected_error_regexp = r"No node named \"foo\" exists"
+    with self.assertRaisesRegexp(ValueError, expected_error_regexp):
       dump.node_attributes("foo")
 
   def testGraphStructureLookupGivesDebugWatchKeys(self):
@@ -722,7 +736,6 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
     self.assertEqual(0, u_data[0].output_slot)
     self.assertEqual("DebugIdentity", u_data[0].debug_op)
     self.assertGreaterEqual(u_data[0].timestamp, 0)
-
     self.assertEqual([], dump.watch_key_to_data("foo"))
 
   def testGraphStructureLookupGivesNodeInputsAndRecipients(self):
@@ -753,12 +766,13 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
     self.assertEqual([], dump.node_recipients(w_name, is_control=True))
 
     # Test errors raised on invalid node names.
-    with self.assertRaisesRegexp(ValueError,
-                                 "does not exist in partition graphs"):
+    if test_util.gpu_device_name():
+      expected_error_regexp = r"None of the .* devices has a node named "
+    else:
+      expected_error_regexp = "does not exist in the partition graph of device "
+    with self.assertRaisesRegexp(ValueError, expected_error_regexp):
       dump.node_inputs(u_name + "foo")
-
-    with self.assertRaisesRegexp(ValueError,
-                                 "does not exist in partition graphs"):
+    with self.assertRaisesRegexp(ValueError, expected_error_regexp):
       dump.node_recipients(u_name + "foo")
 
     # Test transitive_inputs().
@@ -769,8 +783,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
     self.assertEqual(
         set([u_name, u_read_name, v_name]), set(dump.transitive_inputs(w_name)))
 
-    with self.assertRaisesRegexp(ValueError,
-                                 "does not exist in partition graphs"):
+    with self.assertRaisesRegexp(ValueError, expected_error_regexp):
       dump.transitive_inputs(u_name + "foo")
 
   def testGraphStructureLookupWithoutPartitionGraphsDoesNotErrorOut(self):
@@ -1067,10 +1080,12 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       y = array_ops.squeeze(ph, name="mismatch/y")
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
+      run_metadata = config_pb2.RunMetadata()
       debug_utils.watch_graph(
           run_options, sess.graph, debug_urls=self._debug_urls(), global_step=1)
 
-      sess.run(x, feed_dict={ph: np.array([[7.0, 8.0]])}, options=run_options)
+      sess.run(x, feed_dict={ph: np.array([[7.0, 8.0]])}, options=run_options,
+               run_metadata=run_metadata)
       dump1 = debug_data.DebugDumpDir(self._dump_root)
       self.assertEqual(1, dump1.core_metadata.global_step)
       self.assertGreaterEqual(dump1.core_metadata.session_run_index, 0)
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index ffce6ce4c8..c2fc7e3af9 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -437,6 +437,7 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
                            'WholeFileReader', 'TextLineReader',
                            'FixedLengthRecordReader',
                            'TFRecordReader', 'IdentityReader',
+                           'LMDBReader',
                            'RefSwitch', 'RefEnter', 'RefNextIteration',
                            'RefMerge', 'RefIdentity']:
               pass
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 79c7905427..242c396acb 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -947,7 +947,7 @@ cuda_py_test(
     tags = ["notsan"],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "diag_op_test",
     size = "medium",
     srcs = ["diag_op_test.py"],
@@ -979,6 +979,7 @@ tf_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
+    data = ["//tensorflow/core:lmdb_testdata"],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index dbbc2de811..013aa1ba8a 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -129,7 +129,7 @@ class MathBuiltinUnaryTest(test.TestCase):
     for dtype in [np.float32]:
       self._testDtype(dtype, use_gpu=True)
 
-  def testFloorDevide(self):
+  def testFloorDivide(self):
     x = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape(
         [1, 3, 2])
     y = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape(
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index 4744e68051..f0b7885732 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -39,21 +39,23 @@ class MatrixDiagTest(test.TestCase):
       self.assertEqual((3, 3), v_diag.get_shape())
       self.assertAllEqual(v_diag.eval(), mat)
 
-  def testBatchVector(self):
+  def _testBatchVector(self, dtype):
     with self.test_session(use_gpu=True):
-      v_batch = np.array([[1.0, 2.0, 3.0],
-                          [4.0, 5.0, 6.0]])
-      mat_batch = np.array(
-          [[[1.0, 0.0, 0.0],
-            [0.0, 2.0, 0.0],
-            [0.0, 0.0, 3.0]],
-           [[4.0, 0.0, 0.0],
-            [0.0, 5.0, 0.0],
-            [0.0, 0.0, 6.0]]])
+      v_batch = np.array([[1.0, 0.0, 3.0], [4.0, 5.0, 6.0]]).astype(dtype)
+      mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 3.0]],
+                            [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0],
+                             [0.0, 0.0, 6.0]]]).astype(dtype)
       v_batch_diag = array_ops.matrix_diag(v_batch)
       self.assertEqual((2, 3, 3), v_batch_diag.get_shape())
       self.assertAllEqual(v_batch_diag.eval(), mat_batch)
 
+  def testBatchVector(self):
+    self._testBatchVector(np.float32)
+    self._testBatchVector(np.float64)
+    self._testBatchVector(np.int32)
+    self._testBatchVector(np.int64)
+    self._testBatchVector(np.bool)
+
   def testInvalidShape(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 1"):
       array_ops.matrix_diag(0)
@@ -108,29 +110,29 @@ class MatrixSetDiagTest(test.TestCase):
       self.assertEqual((3, 2), output.get_shape())
       self.assertAllEqual(expected, output.eval())
 
-  def testSquareBatch(self):
+  def _testSquareBatch(self, dtype):
     with self.test_session(use_gpu=True):
-      v_batch = np.array([[-1.0, -2.0, -3.0],
-                          [-4.0, -5.0, -6.0]])
-      mat_batch = np.array(
-          [[[1.0, 0.0, 3.0],
-            [0.0, 2.0, 0.0],
-            [1.0, 0.0, 3.0]],
-           [[4.0, 0.0, 4.0],
-            [0.0, 5.0, 0.0],
-            [2.0, 0.0, 6.0]]])
+      v_batch = np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]]).astype(dtype)
+      mat_batch = np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]],
+                            [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0],
+                             [2.0, 0.0, 6.0]]]).astype(dtype)
+
+      mat_set_diag_batch = np.array([[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0],
+                                      [1.0, 0.0, -3.0]],
+                                     [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0],
+                                      [2.0, 0.0, -6.0]]]).astype(dtype)
 
-      mat_set_diag_batch = np.array(
-          [[[-1.0, 0.0, 3.0],
-            [0.0, -2.0, 0.0],
-            [1.0, 0.0, -3.0]],
-           [[-4.0, 0.0, 4.0],
-            [0.0, -5.0, 0.0],
-            [2.0, 0.0, -6.0]]])
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 3, 3), output.get_shape())
       self.assertAllEqual(mat_set_diag_batch, output.eval())
 
+  def testSquareBatch(self):
+    self._testSquareBatch(np.float32)
+    self._testSquareBatch(np.float64)
+    self._testSquareBatch(np.int32)
+    self._testSquareBatch(np.int64)
+    self._testSquareBatch(np.bool)
+
   def testRectangularBatch(self):
     with self.test_session(use_gpu=True):
       v_batch = np.array([[-1.0, -2.0],
@@ -220,22 +222,24 @@ class MatrixDiagPartTest(test.TestCase):
       mat_diag = array_ops.matrix_diag_part(mat)
       self.assertAllEqual(mat_diag.eval(), np.array([1.0, 4.0]))
 
-  def testSquareBatch(self):
+  def _testSquareBatch(self, dtype):
     with self.test_session(use_gpu=True):
-      v_batch = np.array([[1.0, 2.0, 3.0],
-                          [4.0, 5.0, 6.0]])
-      mat_batch = np.array(
-          [[[1.0, 0.0, 0.0],
-            [0.0, 2.0, 0.0],
-            [0.0, 0.0, 3.0]],
-           [[4.0, 0.0, 0.0],
-            [0.0, 5.0, 0.0],
-            [0.0, 0.0, 6.0]]])
+      v_batch = np.array([[1.0, 0.0, 3.0], [4.0, 5.0, 6.0]]).astype(dtype)
+      mat_batch = np.array([[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 3.0]],
+                            [[4.0, 0.0, 0.0], [0.0, 5.0, 0.0],
+                             [0.0, 0.0, 6.0]]]).astype(dtype)
       self.assertEqual(mat_batch.shape, (2, 3, 3))
       mat_batch_diag = array_ops.matrix_diag_part(mat_batch)
       self.assertEqual((2, 3), mat_batch_diag.get_shape())
       self.assertAllEqual(mat_batch_diag.eval(), v_batch)
 
+  def testSquareBatch(self):
+    self._testSquareBatch(np.float32)
+    self._testSquareBatch(np.float64)
+    self._testSquareBatch(np.int32)
+    self._testSquareBatch(np.int64)
+    self._testSquareBatch(np.bool)
+
   def testRectangularBatch(self):
     with self.test_session(use_gpu=True):
       v_batch = np.array([[1.0, 2.0],
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 10f34751d0..0038f4bbf3 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -858,5 +858,48 @@ class AsyncReaderTest(test.TestCase):
     output.append(sess.run(args))
 
 
+class LMDBReaderTest(test.TestCase):
+
+  def setUp(self):
+    super(LMDBReaderTest, self).setUp()
+
+  def testReadFromFile(self):
+    with self.test_session() as sess:
+      reader = io_ops.LMDBReader(name="test_read_from_file")
+      path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata",
+                          "data.mdb")
+      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+      key, value = reader.read(queue)
+
+      queue.enqueue([path]).run()
+      queue.close().run()
+      for i in range(10):
+        k, v = sess.run([key, value])
+        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+        self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i))))
+
+      with self.assertRaisesOpError("is closed and has insufficient elements "
+                                    "\\(requested 1, current size 0\\)"):
+        k, v = sess.run([key, value])
+
+  def testReadFromFolder(self):
+    with self.test_session() as sess:
+      reader = io_ops.LMDBReader(name="test_read_from_folder")
+      path = os.path.join("tensorflow", "core", "lib", "lmdb", "testdata")
+      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+      key, value = reader.read(queue)
+
+      queue.enqueue([path]).run()
+      queue.close().run()
+      for i in range(10):
+        k, v = sess.run([key, value])
+        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+        self.assertAllEqual(compat.as_bytes(v), compat.as_bytes(str(chr(ord('a') + i))))
+
+      with self.assertRaisesOpError("is closed and has insufficient elements "
+                                    "\\(requested 1, current size 0\\)"):
+        k, v = sess.run([key, value])
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index b61168695a..49dcd2370c 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -149,39 +149,13 @@ class _Conv(base.Layer):
     self.built = True
 
   def call(self, inputs):
-    if (self.data_format == 'channels_first' and
-        not framework.test_util.gpu_device_name()):
-      # `nn.convolution` is not implemented on CPU for `channels_first` format.
-      # In cases where we are most likely running on CPU using `channels_first`,
-      # we reshape the inputs to use `channels_last` (and reshape them back
-      # afterwards). This is a temporary fix; a better solution would be a fix
-      # at the op level.
-      # TODO(chollet): remove this when `nn.convolution` is feature-complete.
-      data_format = 'channels_last'
-      if self.rank == 1:
-        inputs = array_ops.transpose(inputs, (0, 2, 1))
-      elif self.rank == 2:
-        inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
-      elif self.rank == 3:
-        inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
-    else:
-      data_format = self.data_format
     outputs = nn.convolution(
         input=inputs,
         filter=self.kernel,
         dilation_rate=self.dilation_rate,
         strides=self.strides,
         padding=self.padding.upper(),
-        data_format=utils.convert_data_format(data_format,
-                                              self.rank + 2))
-    if (self.data_format == 'channels_first' and
-        not framework.test_util.gpu_device_name()):
-      if self.rank == 1:
-        outputs = array_ops.transpose(outputs, (0, 2, 1))
-      elif self.rank == 2:
-        outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
-      elif self.rank == 3:
-        outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
+        data_format=utils.convert_data_format(self.data_format, self.rank + 2))
 
     if self.bias is not None:
       if self.data_format == 'channels_first':
@@ -202,18 +176,10 @@ class _Conv(base.Layer):
                                          [outputs_shape[0], outputs_shape[1],
                                           outputs_shape[2] * outputs_shape[3],
                                           outputs_shape[4]])
-          outputs_4d = nn.bias_add(
-              outputs_4d,
-              self.bias,
-              data_format=utils.convert_data_format(self.data_format, 4))
+          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
           outputs = array_ops.reshape(outputs_4d, outputs_shape)
       else:
-        outputs = nn.bias_add(
-            outputs,
-            self.bias,
-            data_format=utils.convert_data_format(self.data_format, 4))
-        # Note that we passed rank=4 because bias_add will only accept
-        # NHWC and NCWH even if the rank of the inputs is 3 or 5.
+        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
 
     if self.activation is not None:
       return self.activation(outputs)
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 6cd644b642..e903afa0a8 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -262,16 +262,7 @@ class _Pooling2D(base.Layer):
     self.input_spec = base.InputSpec(ndim=4)
 
   def call(self, inputs):
-    if (self.data_format == 'channels_first' and
-        not framework.test_util.gpu_device_name()):
-      # `nn.convolution` is not implemented on CPU for `channels_first` format.
-      # TODO(chollet): remove this when `nn.convolution` is feature-complete.
-      data_format = 'channels_last'
-      inputs = array_ops.transpose(inputs, (0, 2, 3, 1))
-    else:
-      data_format = self.data_format
-
-    if data_format == 'channels_last':
+    if self.data_format == 'channels_last':
       pool_shape = (1,) + self.pool_size + (1,)
       strides = (1,) + self.strides + (1,)
     else:
@@ -282,11 +273,7 @@ class _Pooling2D(base.Layer):
         ksize=pool_shape,
         strides=strides,
         padding=self.padding.upper(),
-        data_format=utils.convert_data_format(data_format, 4))
-
-    if (self.data_format == 'channels_first' and
-        not framework.test_util.gpu_device_name()):
-      outputs = array_ops.transpose(outputs, (0, 3, 1, 2))
+        data_format=utils.convert_data_format(self.data_format, 4))
     return outputs
 
   def _compute_output_shape(self, input_shape):
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index 06adfc5066..553e0dc135 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -191,6 +191,7 @@ WholeFileReader
 TextLineReaderV2
 TFRecordReaderV2
 WholeFileReaderV2
+LMDBReader
 
 # linalg_ops
 BatchCholesky
diff --git a/tensorflow/python/ops/io_ops.py b/tensorflow/python/ops/io_ops.py
index 68ecc219e4..8f5d0e5cd4 100644
--- a/tensorflow/python/ops/io_ops.py
+++ b/tensorflow/python/ops/io_ops.py
@@ -26,6 +26,7 @@ See the @{$python/io_ops} guide.
 @@WholeFileReader
 @@IdentityReader
 @@TFRecordReader
+@@LMDBReader
 @@FixedLengthRecordReader
 @@decode_csv
 @@decode_raw
@@ -443,6 +444,25 @@ class TFRecordReader(ReaderBase):
 ops.NotDifferentiable("TFRecordReader")
 
 
+class LMDBReader(ReaderBase):
+  """A Reader that outputs the records from a LMDB file.
+
+  See ReaderBase for supported methods.
+  """
+  def __init__(self, name=None, options=None):
+    """Create a LMDBReader.
+
+    Args:
+      name: A name for the operation (optional).
+      options: A LMDBRecordOptions object (optional).
+    """
+    rr = gen_io_ops._lmdb_reader(name=name)
+    super(LMDBReader, self).__init__(rr)
+
+
+ops.NotDifferentiable("LMDBReader")
+
+
 class IdentityReader(ReaderBase):
   """A Reader that outputs the queued work as both the key and value.
 
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index ebdb137fb9..fe532fa186 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -26,6 +26,7 @@ from __future__ import print_function
 import threading
 
 import numpy as np
+import six
 
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import function
@@ -81,6 +82,10 @@ class FuncRegistry(object):
     if func is None:
       raise ValueError("callback %s is not found" % token)
     ret = func(*args)
+    # Strings seem to lead to a memory leak here if they're not wrapped in a
+    # list.
+    if isinstance(ret, six.binary_type):
+      ret = [ret]
     # Ensures that we return either a single numpy array or a list of numpy
     # arrays.
     if isinstance(ret, (tuple, list)):
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 5ae82524a3..459c735ea3 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -113,10 +113,14 @@ class AdamOptimizer(optimizer.Optimizer):
 
   def _create_slots(self, var_list):
     # Create the beta1 and beta2 accumulators on the same device as the first
-    # variable.
+    # variable. Sort the var_list to make sure this device is consistent across
+    # workers (these need to go on the same PS, otherwise some updates are
+    # silently ignored).
+    first_var = min(var_list, key=lambda x: x.name)
+
     if (self._beta1_power is None or
-        self._beta1_power.graph is not var_list[0].graph):
-      with ops.colocate_with(var_list[0]):
+        self._beta1_power.graph is not first_var.graph):
+      with ops.colocate_with(first_var):
         self._beta1_power = variable_scope.variable(self._beta1,
                                                     name="beta1_power",
                                                     trainable=False)
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index 88df3351e6..05c99856d2 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -17,14 +17,52 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import functools
+import itertools
 import traceback
 import types
 
+import six  # pylint: disable=unused-import
+
+from backports import weakref  # pylint: disable=g-bad-import-order
+
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import tf_decorator
 
 
+class _RefInfoField(
+    collections.namedtuple(
+        '_RefInfoField', ('type_', 'repr_', 'creation_stack', 'object_used'))):
+  pass
+
+
+# Thread-safe up to int32max/2 thanks to python's GIL; and may be safe even for
+# higher values in Python 3.4+.  We don't expect to ever count higher than this.
+# https://mail.python.org/pipermail/python-list/2005-April/342279.html
+_REF_ITER = itertools.count()
+
+# Dictionary mapping id(obj) => _RefInfoField.
+_REF_INFO = {}
+
+
+def _deleted(obj_id, fatal_error):
+  obj = _REF_INFO[obj_id]
+  del _REF_INFO[obj_id]
+  if not obj.object_used:
+    if fatal_error:
+      logger = tf_logging.fatal
+    else:
+      logger = tf_logging.error
+    logger(
+        '==================================\n'
+        'Object was never used (type %s):\n%s\nIf you want to mark it as '
+        'used call its "mark_used()" method.\nIt was originally created '
+        'here:\n%s\n'
+        '==================================' %
+        (obj.type_, obj.repr_, obj.creation_stack))
+
+
 def _add_should_use_warning(x, fatal_error=False):
   """Wraps object x so that if it is never used, a warning is logged.
 
@@ -39,14 +77,14 @@ def _add_should_use_warning(x, fatal_error=False):
   """
   if x is None:  # special corner case where x is None
     return x
-  has_been_used = getattr(x, '_tf_object_has_been_used', None)
-  if has_been_used is not None:
-    x._tf_object_has_been_used = has_been_used  # pylint: disable=protected-access
+  if hasattr(x, '_tf_ref_id'):  # this is already a TFShouldUseWarningWrapper
     return x
 
   def override_method(method):
     def fn(self, *args, **kwargs):
-      self._tf_object_has_been_used = True  # pylint: disable=protected-access
+      # pylint: disable=protected-access
+      _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+          object_used=True)
       return method(self, *args, **kwargs)
     return fn
 
@@ -55,38 +93,36 @@ def _add_should_use_warning(x, fatal_error=False):
 
     def __init__(self, true_self):
       self.__dict__ = true_self.__dict__
-      stack = [x.strip() for x in traceback.format_stack()]
+      stack = [s.strip() for s in traceback.format_stack()]
       # Remove top three stack entries from adding the wrapper
-      self._tf_object_creation_stack = '\n'.join(stack[:-3])
-      self._tf_object_has_been_used = False
+      self.creation_stack = '\n'.join(stack[:-3])
+      self._tf_ref_id = next(_REF_ITER)
+      _REF_INFO[self._tf_ref_id] = _RefInfoField(
+          type_=type(x),
+          repr_=repr(x),
+          creation_stack=stack,
+          object_used=False)
+
+      # Create a finalizer for self, which will be called when self is
+      # garbage collected.  Can't add self as the args because the
+      # loop will break garbage collection.  We keep track of
+      # ourselves via python ids.
+      weakref.finalize(self, _deleted, self._tf_ref_id, fatal_error)
 
     # Not sure why this pylint warning is being used; this is not an
     # old class form.
     # pylint: disable=super-on-old-class
     def __getattribute__(self, name):
-      if name != '_tf_object_has_been_used':
-        self._tf_object_has_been_used = True
+      if name == '_tf_ref_id':
+        return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
+      if self._tf_ref_id in _REF_INFO:
+        _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+            object_used=True)
       return super(TFShouldUseWarningWrapper, self).__getattribute__(name)
 
-    def __del__(self):
-      if not self._tf_object_has_been_used:
-        if fatal_error:
-          logger = tf_logging.fatal
-        else:
-          logger = tf_logging.error
-        logger(
-            '==================================\n'
-            'Object was never used (type %s):\n%s\nIf you want to mark it as '
-            'used call its "mark_used()" method.\nIt was originally created '
-            'here:\n%s\n'
-            '==================================' %
-            (type(x), x, self._tf_object_creation_stack))
-
-      if hasattr(super(TFShouldUseWarningWrapper, self), '__del__'):
-        return super(TFShouldUseWarningWrapper, self).__del__()
-
     def mark_used(self, *args, **kwargs):
-      self._tf_object_has_been_used = True
+      _REF_INFO[self._tf_ref_id] = _REF_INFO[self._tf_ref_id]._replace(
+          object_used=True)
       if hasattr(super(TFShouldUseWarningWrapper, self), 'mark_used'):
         return super(TFShouldUseWarningWrapper, self).mark_used(*args, **kwargs)
     # pylint: enable=super-on-old-class
@@ -102,7 +138,8 @@ def _add_should_use_warning(x, fatal_error=False):
 
   wrapped = TFShouldUseWarningWrapper(x)
   wrapped.__doc__ = x.__doc__  # functools.wraps fails on some objects.
-  wrapped._tf_object_has_been_used = False   # pylint: disable=protected-access
+  ref_id = wrapped._tf_ref_id  # pylint: disable=protected-access
+  _REF_INFO[ref_id] = _REF_INFO[ref_id]._replace(object_used=False)
   return wrapped
 
 
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
index 71d48e3dde..c826874400 100644
--- a/tensorflow/python/util/tf_should_use_test.py
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import gc
 import sys
 
 from tensorflow.python.framework import constant_op
@@ -45,7 +46,7 @@ def reroute_error(captured):
 class TfShouldUseTest(test.TestCase):
 
   def testAddShouldUseWarningWhenNotUsed(self):
-    c = constant_op.constant(0, name='blah')
+    c = constant_op.constant(0, name='blah0')
     captured = []
     with reroute_error(captured):
       def in_this_function():
@@ -53,44 +54,52 @@ class TfShouldUseTest(test.TestCase):
         del h
       in_this_function()
     self.assertIn('Object was never used', '\n'.join(captured))
-    self.assertIn('blah:0', '\n'.join(captured))
+    self.assertIn('blah0:0', '\n'.join(captured))
     self.assertIn('in_this_function', '\n'.join(captured))
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
-  def _testAddShouldUseWarningWhenUsed(self, fn):
-    c = constant_op.constant(0, name='blah')
+  def _testAddShouldUseWarningWhenUsed(self, fn, name):
+    c = constant_op.constant(0, name=name)
     captured = []
     with reroute_error(captured):
       h = tf_should_use._add_should_use_warning(c)
       fn(h)
       del h
     self.assertNotIn('Object was never used', '\n'.join(captured))
-    self.assertNotIn('blah:0', '\n'.join(captured))
+    self.assertNotIn('%s:0' % name, '\n'.join(captured))
 
   def testAddShouldUseWarningWhenUsedWithAdd(self):
     def add(h):
       _ = h + 1
-    self._testAddShouldUseWarningWhenUsed(add)
+    self._testAddShouldUseWarningWhenUsed(add, name='blah_add')
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
   def testAddShouldUseWarningWhenUsedWithGetName(self):
     def get_name(h):
       _ = h.name
-    self._testAddShouldUseWarningWhenUsed(get_name)
+    self._testAddShouldUseWarningWhenUsed(get_name, name='blah_get_name')
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
   def testShouldUseResult(self):
     @tf_should_use.should_use_result
     def return_const(value):
-      return constant_op.constant(value, name='blah')
+      return constant_op.constant(value, name='blah2')
     captured = []
     with reroute_error(captured):
       return_const(0.0)
     self.assertIn('Object was never used', '\n'.join(captured))
-    self.assertIn('blah:0', '\n'.join(captured))
+    self.assertIn('blah2:0', '\n'.join(captured))
     self.assertIn('return_const', '\n'.join(captured))
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
   def testShouldUseResultWhenNotReallyUsed(self):
     @tf_should_use.should_use_result
     def return_const(value):
-      return constant_op.constant(value, name='blah')
+      return constant_op.constant(value, name='blah3')
     captured = []
     with reroute_error(captured):
       with self.test_session():
@@ -100,8 +109,10 @@ class TfShouldUseTest(test.TestCase):
         v = constant_op.constant(1.0, name='meh')
         v.eval()
     self.assertIn('Object was never used', '\n'.join(captured))
-    self.assertIn('blah:0', '\n'.join(captured))
+    self.assertIn('blah3:0', '\n'.join(captured))
     self.assertIn('return_const', '\n'.join(captured))
+    gc.collect()
+    self.assertFalse(gc.garbage)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 9ea6474934..bf81b9c0ad 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -170,7 +170,7 @@ void Diagnostician::LogDiagnosticInformation() {
     VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
 
     std::vector<string> pieces = port::Split(library_path, ':');
-    for (auto piece : pieces) {
+    for (const auto &piece : pieces) {
       if (piece.empty()) {
         continue;
       }
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 9d3ac4ed9e..802ef755eb 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -48,27 +48,9 @@ limitations under the License.
 namespace perftools {
 namespace gputools {
 
-class KernelBase;
 class Stream;
 class Timer;
 
-namespace blas {
-class BlasSupport;
-}  // namespace blas
-
-namespace fft {
-class Support;
-}  // namespace fft
-
-namespace rng {
-class RngSupport;
-}  // namespace rng
-
-}  // namespace gputools
-}  // namespace perftools
-
-namespace perftools {
-namespace gputools {
 namespace internal {
 
 // Platform-dependent interface class for the generic Events interface, in
diff --git a/tensorflow/tensorboard/backend/application_test.py b/tensorflow/tensorboard/backend/application_test.py
index 87cfdbc1d8..63eb47e9d4 100644
--- a/tensorflow/tensorboard/backend/application_test.py
+++ b/tensorflow/tensorboard/backend/application_test.py
@@ -36,7 +36,6 @@ import tensorflow as tf
 
 from werkzeug import serving
 
-from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.tensorboard import tensorboard
 from tensorflow.tensorboard.backend import application
 from tensorflow.tensorboard.backend.event_processing import event_multiplexer
@@ -221,7 +220,7 @@ class TensorboardServerTest(tf.test.TestCase):
     node2.name = 'b'
     node2.attr['very_large_attr'].s = b'a' * 2048  # 2 KB attribute
 
-    meta_graph_def = meta_graph_pb2.MetaGraphDef(graph_def=graph_def)
+    meta_graph_def = tf.MetaGraphDef(graph_def=graph_def)
 
     if self._only_use_meta_graph:
       writer.add_meta_graph(meta_graph_def)
diff --git a/tensorflow/tensorboard/backend/event_processing/BUILD b/tensorflow/tensorboard/backend/event_processing/BUILD
index 1c3e094e76..dff2dbc90d 100644
--- a/tensorflow/tensorboard/backend/event_processing/BUILD
+++ b/tensorflow/tensorboard/backend/event_processing/BUILD
@@ -134,18 +134,6 @@ py_library(
     ],
 )
 
-py_test(
-    name = "plugin_asset_util_test",
-    size = "small",
-    srcs = ["plugin_asset_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":event_multiplexer",
-        ":plugin_asset_util",
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
 py_library(
     name = "event_file_inspector",
     srcs = ["event_file_inspector.py"],
diff --git a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
index 9efd64bd2e..4ce766f420 100644
--- a/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
+++ b/tensorflow/tensorboard/backend/event_processing/event_accumulator_test.py
@@ -24,7 +24,6 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
-from tensorflow.python.summary.writer.writer import SummaryToEventTransformer
 from tensorflow.tensorboard.backend.event_processing import event_accumulator as ea
 
 
@@ -760,7 +759,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
   def testTFSummaryScalar(self):
     """Verify processing of tf.summary.scalar."""
     event_sink = _EventGenerator(self, zero_out_timestamps=True)
-    writer = SummaryToEventTransformer(event_sink)
+    writer = tf.summary.FileWriter(self.get_temp_dir())
+    writer.event_writer = event_sink
     with self.test_session() as sess:
       ipt = tf.placeholder(tf.float32)
       tf.summary.scalar('scalar1', ipt)
@@ -794,7 +794,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
   def testTFSummaryImage(self):
     """Verify processing of tf.summary.image."""
     event_sink = _EventGenerator(self, zero_out_timestamps=True)
-    writer = SummaryToEventTransformer(event_sink)
+    writer = tf.summary.FileWriter(self.get_temp_dir())
+    writer.event_writer = event_sink
     with self.test_session() as sess:
       ipt = tf.ones([10, 4, 4, 3], tf.uint8)
       # This is an interesting example, because the old tf.image_summary op
@@ -830,7 +831,8 @@ class MockingEventAccumulatorTest(EventAccumulatorTest):
   def testTFSummaryTensor(self):
     """Verify processing of tf.summary.tensor."""
     event_sink = _EventGenerator(self, zero_out_timestamps=True)
-    writer = SummaryToEventTransformer(event_sink)
+    writer = tf.summary.FileWriter(self.get_temp_dir())
+    writer.event_writer = event_sink
     with self.test_session() as sess:
       tf.summary.tensor_summary('scalar', tf.constant(1.0))
       tf.summary.tensor_summary('vector', tf.constant([1.0, 2.0, 3.0]))
diff --git a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py b/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
deleted file mode 100644
index 8fe5b31621..0000000000
--- a/tensorflow/tensorboard/backend/event_processing/plugin_asset_util_test.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os.path
-
-import tensorflow as tf
-
-from tensorflow.python.summary import plugin_asset
-
-from tensorflow.tensorboard.backend.event_processing import event_multiplexer
-from tensorflow.tensorboard.backend.event_processing import plugin_asset_util
-
-
-class GenericContentPlugin(plugin_asset.PluginAsset):
-
-  def __init__(self):
-    self.contents = "hello world"
-
-  def assets(self):
-    return {"contents.txt": self.contents}
-
-
-class PluginAlpha(GenericContentPlugin):
-  plugin_name = "Alpha"
-
-
-class PluginBeta(GenericContentPlugin):
-  plugin_name = "Beta"
-
-
-class PluginGamma(GenericContentPlugin):
-  plugin_name = "Gamma"
-
-
-class PluginAssetUtilitiesTest(tf.test.TestCase):
-
-  def testGetPluginDirectory(self):
-    self.assertEqual(
-        os.path.join("logdir", "plugins", "x"),
-        plugin_asset_util.PluginDirectory("logdir", "x"))
-
-  def testNonExistentDirectory(self):
-    tempdir = self.get_temp_dir()
-    fake_dir = os.path.join(tempdir, "nonexistent_dir")
-    self.assertEqual([], plugin_asset_util.ListPlugins(fake_dir))
-    self.assertEqual([], plugin_asset_util.ListAssets(fake_dir, "fake_plugin"))
-    with self.assertRaises(KeyError):
-      plugin_asset_util.RetrieveAsset(fake_dir, "fake_plugin", "fake_asset")
-
-  def testSimplePluginCase(self):
-    tempdir = self.get_temp_dir()
-    with tf.Graph().as_default() as g:
-      plugin_asset.get_plugin_asset(PluginAlpha)
-      fw = tf.summary.FileWriter(tempdir)
-      fw.add_graph(g)
-    self.assertEqual(["Alpha"], plugin_asset_util.ListPlugins(tempdir))
-    assets = plugin_asset_util.ListAssets(tempdir, "Alpha")
-    self.assertEqual(["contents.txt"], assets)
-    contents = plugin_asset_util.RetrieveAsset(tempdir, "Alpha", "contents.txt")
-    self.assertEqual("hello world", contents)
-
-  def testEventMultiplexerIntegration(self):
-    tempdir = self.get_temp_dir()
-    with tf.Graph().as_default() as g:
-      plugin_instance = plugin_asset.get_plugin_asset(PluginAlpha)
-      plugin_instance.contents = "graph one"
-      plugin_asset.get_plugin_asset(PluginBeta)
-
-      fw = tf.summary.FileWriter(os.path.join(tempdir, "one"))
-      fw.add_graph(g)
-      fw.close()
-
-    with tf.Graph().as_default() as g:
-      plugin_instance = plugin_asset.get_plugin_asset(PluginAlpha)
-      plugin_instance.contents = "graph two"
-      fw = tf.summary.FileWriter(os.path.join(tempdir, "two"))
-      fw.add_graph(g)
-      fw.close()
-
-    multiplexer = event_multiplexer.EventMultiplexer()
-    multiplexer.AddRunsFromDirectory(tempdir)
-
-    self.assertEqual(
-        multiplexer.PluginAssets("Alpha"),
-        {"one": ["contents.txt"], "two": ["contents.txt"]})
-    self.assertEqual(
-        multiplexer.RetrievePluginAsset("one", "Alpha", "contents.txt"),
-        "graph one")
-    self.assertEqual(
-        multiplexer.RetrievePluginAsset("one", "Beta", "contents.txt"),
-        "hello world")
-    self.assertEqual(
-        multiplexer.RetrievePluginAsset("two", "Alpha", "contents.txt"),
-        "graph two")
-
-    self.assertEqual(
-        multiplexer.PluginAssets("Beta"),
-        {"one": ["contents.txt"], "two": []})
-    self.assertEqual(multiplexer.PluginAssets("Gamma"), {"one": [], "two": []})
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tensorflow/tensorboard/plugins/scalars/scalars_demo.py b/tensorflow/tensorboard/plugins/scalars/scalars_demo.py
index e9b2a6129d..f3195fd849 100644
--- a/tensorflow/tensorboard/plugins/scalars/scalars_demo.py
+++ b/tensorflow/tensorboard/plugins/scalars/scalars_demo.py
@@ -23,8 +23,6 @@ import os.path
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import tensorflow as tf
 
-from tensorflow.python.platform import app
-
 # Directory into which to write tensorboard data.
 LOGDIR = '/tmp/scalars_demo'
 
@@ -129,4 +127,4 @@ def main(unused_argv):
 
 
 if __name__ == '__main__':
-  app.run()
+  tf.app.run()
diff --git a/tensorflow/tensorboard/plugins/text/text_plugin.py b/tensorflow/tensorboard/plugins/text/text_plugin.py
index 2c11b80029..d0040e20be 100644
--- a/tensorflow/tensorboard/plugins/text/text_plugin.py
+++ b/tensorflow/tensorboard/plugins/text/text_plugin.py
@@ -35,7 +35,6 @@ import six
 import tensorflow as tf
 from werkzeug import wrappers
 
-from tensorflow.python.summary import text_summary
 from tensorflow.tensorboard.backend import http_util
 from tensorflow.tensorboard.plugins import base_plugin
 
@@ -256,7 +255,7 @@ class TextPlugin(base_plugin.TBPlugin):
 
   def index_impl(self):
     run_to_series = {}
-    name = text_summary.TextSummaryPluginAsset.plugin_name
+    name = 'tensorboard_text'
     run_to_assets = self.multiplexer.PluginAssets(name)
 
     for run, assets in run_to_assets.items():
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index 68493965fa..5d18295f68 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,14 +1,15 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
 
 MAINTAINER Jan Prach <jendap@google.com>
 
 # In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
 # /usr/local/cuda
-RUN cp /usr/include/cudnn.h /usr/local/cuda/include
-RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
+RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
+RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
 
 # Copy and run the install scripts.
 COPY install/*.sh /install/
+ARG DEBIAN_FRONTEND=noninteractive
 RUN /install/install_bootstrap_deb_packages.sh
 RUN add-apt-repository -y ppa:openjdk-r/ppa && \
     add-apt-repository -y ppa:george-edison55/cmake-3.x
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu_clang b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
index 00aaa9f760..c4342d17f5 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu_clang
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
 
 MAINTAINER Ilya Biryukov <ibiryukov@google.com>
 
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index b8f9fc8453..8768852dc7 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -85,3 +85,6 @@ pip2 install mock
 
 pip2 install portpicker
 pip3 install portpicker
+
+pip2 install backports.weakref==1.0rc1
+pip3 install backports.weakref==1.0rc1
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index e7e2d256cd..edfc4e3a98 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -89,3 +89,6 @@ pip3.5 install wheel==0.29.0
 pip3.5 install portpicker
 
 pip3.5 install werkzeug
+
+pip3.5 install backports.weakref==1.0rc1
+
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index d1de9eace5..d0a038a9db 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
 
 MAINTAINER Craig Citro <craigcitro@google.com>
 
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 88876421f5..3ba1e963f9 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
 
 MAINTAINER Craig Citro <craigcitro@google.com>
 
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 52a65e1c9d..7ae1d2abd9 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -897,7 +897,7 @@ class _ClassPageInfo(object):
 
   @property
   def guides(self):
-    """Returns a markdown string containing backlinks to relevent api_guides."""
+    """Returns a markdown string containing backlinks to relevant api_guides."""
     return self._guides
 
   def set_guides(self, guides):
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 1e36af93ea..51ba3b7a0b 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -97,6 +97,7 @@ genrule(
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
         "@libxsmm_archive//:LICENSE",
+        "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@png_archive//:LICENSE",
         "@protobuf//:LICENSE",
@@ -126,6 +127,7 @@ genrule(
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
         "@libxsmm_archive//:LICENSE",
+        "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@png_archive//:LICENSE",
         "@protobuf//:LICENSE",
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 0c2660ff37..09fe6c53cd 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -110,6 +110,7 @@ filegroup(
         "@jemalloc//:COPYING",
         "@jpeg//:LICENSE.md",
         "@libxsmm_archive//:LICENSE",
+        "@lmdb//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@nanopb_git//:LICENSE.txt",
         "@org_html5lib//:LICENSE",
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index 0524d2f1aa..58a80fd98a 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -46,6 +46,7 @@ BLACKLIST = [
     "//tensorflow/python:tf_optimizer",
     "//tensorflow/python:compare_test_proto_py",
     "//tensorflow/core:image_testdata",
+    "//tensorflow/core:lmdb_testdata",
     "//tensorflow/core/kernels/cloud:bigquery_reader_ops",
     "//tensorflow/python/feature_column:vocabulary_testdata",
     "//tensorflow/python:framework/test_file_system.so",
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 79b8e5bfa3..eb76fb3ce3 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -39,6 +39,7 @@ REQUIRED_PACKAGES = [
     'html5lib == 0.9999999',  # identical to 1.0b8
     'markdown == 2.2.0',
     'bleach == 1.5.0',
+    'backports.weakref == 1.0rc1',
 ]
 
 project_name = 'tensorflow'
diff --git a/tensorflow/tools/test/system_info_lib.py b/tensorflow/tools/test/system_info_lib.py
index c352abaa54..0cc261591b 100644
--- a/tensorflow/tools/test/system_info_lib.py
+++ b/tensorflow/tools/test/system_info_lib.py
@@ -96,7 +96,7 @@ def gather_cpu_info():
   cpu_info.cpu_info = info['brand']
   cpu_info.num_cores = info['count']
   cpu_info.mhz_per_cpu = info['hz_advertised_raw'][0] / 1.0e6
-  l2_cache_size = re.match(r'(\d+)', str(info['l2_cache_size']))
+  l2_cache_size = re.match(r'(\d+)', str(info.get('l2_cache_size', '')))
   if l2_cache_size:
     # If a value is returned, it's in KB
     cpu_info.cache_size['L2'] = int(l2_cache_size.group(0)) * 1024
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 18bc9a8277..d7dd16919a 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -508,6 +508,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   )
 
   native.new_http_archive(
+    name = "lmdb",
+    urls = [
+      "http://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
+      "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
+    ],
+    sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326",
+    strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb",
+    build_file = str(Label("//third_party:lmdb.BUILD")),
+  )
+
+  native.new_http_archive(
       name = "jsoncpp_git",
       urls = [
           "http://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
diff --git a/third_party/lmdb.BUILD b/third_party/lmdb.BUILD
new file mode 100644
index 0000000000..357e4a9335
--- /dev/null
+++ b/third_party/lmdb.BUILD
@@ -0,0 +1,25 @@
+# Description:
+#   LMDB is the Lightning Memory-mapped Database.
+
+licenses(["notice"])  # OpenLDAP Public License
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "lmdb",
+    srcs = [
+        "mdb.c",
+        "midl.c",
+    ],
+    hdrs = [
+        "lmdb.h",
+        "midl.h",
+    ],
+    copts = [
+        "-w",
+    ],
+    linkopts = [
+        "-lpthread",
+    ],
+    visibility = ["//visibility:public"],
+)