author Shanqing Cai <cais@google.com> 2017-06-05 10:29:50 -0700
committer TensorFlower Gardener <gardener@tensorflow.org> 2017-06-05 10:33:23 -0700
commit cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844 (patch)
tree 80db01fd15e27ebc0d7b88959caf3140c644bd39
parent a5909d64320a9dfd940b298bcb0bd758e514a04f (diff)
tfdbg: dump debug data from different devices in separate directories
Fixes #7051, in which TFDBG failed to load the data dump from a Session.run() call involving multiple GPUs.

The root cause of the bug was that TFDBG previously assumed that node names are unique across all partition graphs. This does not hold when multiple GPUs are present: the Send/Recv nodes in the partition graphs of the GPUs can have duplicate names. Other cases like this may arise in the future for other reasons (e.g., distributed sessions and/or graph optimization). This CL relaxes the assumption by dumping the GraphDef and tensor data from different devices into separate sub-directories under the dump root directory.

PiperOrigin-RevId: 158029814
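For reference, the device-name-to-directory mapping introduced by this CL can be sketched in a few lines of standalone Python, mirroring device_name_to_device_path() and device_path_to_device_name() added to debug_data.py below; the expected string matches the new ConstructDebugNodeKey test in debug_io_utils_test.cc:

    METADATA_FILE_PREFIX = "_tfdbg_"
    DEVICE_TAG = "device_"

    def device_name_to_device_path(device_name):
      # "/" splits the device name into items; ":" within each item becomes
      # "_", and the items are re-joined with "," so the whole device name
      # fits into a single, filesystem-safe directory component.
      items = [item.replace(":", "_") for item in device_name.split("/")]
      return METADATA_FILE_PREFIX + DEVICE_TAG + ",".join(items)

    def device_path_to_device_name(device_dir):
      # Inverse mapping: strip the "_tfdbg_device_" prefix, split on ",",
      # and restore the first "_" of each item to ":".
      items = device_dir[len(METADATA_FILE_PREFIX) + len(DEVICE_TAG):].split(",")
      return "/".join(item.replace("_", ":", 1) for item in items)

    name = "/job:worker/replica:1/task:0/gpu:2"
    path = device_name_to_device_path(name)
    assert path == "_tfdbg_device_,job_worker,replica_1,task_0,gpu_2"
    assert device_path_to_device_name(path) == name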
-rw-r--r--  tensorflow/core/debug/debug_io_utils.cc                        38
-rw-r--r--  tensorflow/core/debug/debug_io_utils.h                          8
-rw-r--r--  tensorflow/core/debug/debug_io_utils_test.cc                   28
-rw-r--r--  tensorflow/core/debug/grpc_session_debug_test.cc               10
-rw-r--r--  tensorflow/core/kernels/debug_ops_test.cc                      21
-rw-r--r--  tensorflow/python/debug/BUILD                                  16
-rw-r--r--  tensorflow/python/debug/lib/debug_data.py                     809
-rw-r--r--  tensorflow/python/debug/lib/debug_data_test.py                140
-rw-r--r--  tensorflow/python/debug/lib/session_debug_multi_gpu_test.py    93
-rw-r--r--  tensorflow/python/debug/lib/session_debug_testlib.py           47
10 files changed, 884 insertions, 326 deletions
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 25847a20a4..54366ce249 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -119,6 +119,18 @@ Status PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
} // namespace
+// static
+const char* const DebugIO::kMetadataFilePrefix = "_tfdbg_";
+
+// static
+const char* const DebugIO::kCoreMetadataTag = "core_metadata_";
+
+// static
+const char* const DebugIO::kDeviceTag = "device_";
+
+// static
+const char* const DebugIO::kGraphTag = "graph_";
+
DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
const int32 output_slot, const string& debug_op)
: device_name(device_name),
@@ -126,7 +138,8 @@ DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
output_slot(output_slot),
debug_op(debug_op),
debug_node_name(
- strings::StrCat(node_name, ":", output_slot, ":", debug_op)) {}
+ strings::StrCat(node_name, ":", output_slot, ":", debug_op)),
+ device_path(DeviceNameToDevicePath(device_name)) {}
Status ReadEventFromFile(const string& dump_file_path, Event* event) {
Env* env(Env::Default());
@@ -158,6 +171,15 @@ Status ReadEventFromFile(const string& dump_file_path, Event* event) {
}
// static
+const string DebugNodeKey::DeviceNameToDevicePath(const string& device_name) {
+ return strings::StrCat(
+ DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag,
+ str_util::StringReplace(
+ str_util::StringReplace(device_name, ":", "_", true), "/", ",",
+ true));
+}
+
+// static
const char* const DebugIO::kFileURLScheme = "file://";
// static
const char* const DebugIO::kGrpcURLScheme = "grpc://";
@@ -236,7 +258,8 @@ Status DebugIO::PublishDebugMetadata(
const string core_metadata_path = AppendTimestampToFilePath(
io::JoinPath(
dump_root_dir,
- strings::StrCat("_tfdbg_core_metadata_", "sessionrun",
+ strings::StrCat(DebugIO::kMetadataFilePrefix,
+ DebugIO::kCoreMetadataTag, "sessionrun",
strings::Printf("%.14lld", session_run_index))),
Env::Default()->NowMicros());
status.Update(DebugFileIO::DumpEventProtoToFile(
@@ -325,10 +348,11 @@ Status DebugIO::PublishGraph(const Graph& graph, const string& device_name,
Status status = Status::OK();
for (const string& debug_url : debug_urls) {
if (debug_url.find(kFileURLScheme) == 0) {
- const string dump_root_dir = debug_url.substr(strlen(kFileURLScheme));
- // TODO(cais): (b/38325442) Serialize the GraphDef to a directory that
- // reflects the device name.
- const string file_name = strings::StrCat("_tfdbg_graph_", now_micros);
+ const string dump_root_dir =
+ io::JoinPath(debug_url.substr(strlen(kFileURLScheme)),
+ DebugNodeKey::DeviceNameToDevicePath(device_name));
+ const string file_name = strings::StrCat(DebugIO::kMetadataFilePrefix,
+ DebugIO::kGraphTag, now_micros);
status.Update(
DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name));
@@ -437,7 +461,7 @@ string DebugFileIO::GetDumpFilePath(const string& dump_root_dir,
const DebugNodeKey& debug_node_key,
const uint64 wall_time_us) {
return AppendTimestampToFilePath(
- io::JoinPath(dump_root_dir,
+ io::JoinPath(dump_root_dir, debug_node_key.device_path,
strings::StrCat(debug_node_key.node_name, "_",
debug_node_key.output_slot, "_",
debug_node_key.debug_op)),
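With the two hunks above, a tensor dump file now lands at dump_root/<device_path>/<node_name>_<output_slot>_<debug_op>_<timestamp>, where slashes in the node name become further sub-directories. Below is a hypothetical Python rendering of the path assembly, assuming AppendTimestampToFilePath simply appends "_<wall_time_us>" (consistent with the file-name pattern documented in debug_data.py); the concrete values are illustrative:

    import os

    def get_dump_file_path(dump_root_dir, device_path, node_name, output_slot,
                           debug_op, wall_time_us):
      # The device path is the new component between the dump root and the
      # node-scoped file name; "/" in node_name yields nested directories.
      base = "%s_%d_%s" % (node_name, output_slot, debug_op)
      return os.path.join(dump_root_dir, device_path,
                          "%s_%d" % (base, wall_time_us))

    print(get_dump_file_path(
        "/tmp/tfdbg_1", "_tfdbg_device_,job_worker,replica_1,task_0,gpu_2",
        "hidden_1/MatMul", 0, "DebugIdentity", 1496689790000000))
    # /tmp/tfdbg_1/_tfdbg_device_,job_worker,replica_1,task_0,gpu_2/hidden_1/MatMul_0_DebugIdentity_1496689790000000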
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index f3e76cc0ee..69d8c7bd4e 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -44,11 +44,14 @@ struct DebugNodeKey {
DebugNodeKey(const string& device_name, const string& node_name,
const int32 output_slot, const string& debug_op);
+ static const string DeviceNameToDevicePath(const string& device_name);
+
const string device_name;
const string node_name;
const int32 output_slot;
const string debug_op;
const string debug_node_name;
+ const string device_path;
};
class DebugIO {
@@ -136,6 +139,11 @@ class DebugIO {
static Status CloseDebugURL(const string& debug_url);
+ static const char* const kMetadataFilePrefix;
+ static const char* const kCoreMetadataTag;
+ static const char* const kDeviceTag;
+ static const char* const kGraphTag;
+
static const char* const kFileURLScheme;
static const char* const kGrpcURLScheme;
};
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 406bcae07f..77039aa4ab 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/util/event.pb.h"
@@ -47,6 +48,18 @@ class DebugIOUtilsTest : public ::testing::Test {
std::unique_ptr<Tensor> tensor_b_;
};
+TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
+ DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/gpu:2",
+ "hidden_1/MatMul", 0, "DebugIdentity");
+ EXPECT_EQ("/job:worker/replica:1/task:0/gpu:2", debug_node_key.device_name);
+ EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
+ EXPECT_EQ(0, debug_node_key.output_slot);
+ EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
+ EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name);
+ EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,gpu_2",
+ debug_node_key.device_path);
+}
+
TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) {
Initialize();
@@ -138,10 +151,14 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) {
// First, create the file at the path.
const string test_dir = testing::TmpDir();
- const string txt_file_name = strings::StrCat(test_dir, "/baz");
-
- if (!env_->FileExists(test_dir).ok()) {
- ASSERT_TRUE(env_->CreateDir(test_dir).ok());
+ const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+ const DebugNodeKey kDebugNodeKey(kDeviceName, "baz/tensor_a", 0,
+ "DebugIdentity");
+ const string txt_file_dir =
+ io::JoinPath(test_dir, DebugNodeKey::DeviceNameToDevicePath(kDeviceName));
+ const string txt_file_name = io::JoinPath(txt_file_dir, "baz");
+ if (!env_->FileExists(txt_file_dir).ok()) {
+ ASSERT_TRUE(env_->RecursivelyCreateDir(txt_file_dir).ok());
}
ASSERT_EQ(error::Code::NOT_FOUND, env_->FileExists(txt_file_name).code());
@@ -157,8 +174,7 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) {
// Second, try to dump the tensor to a path that requires "baz" to be a
// directory, which should lead to an error.
- const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
- "baz/tensor_a", 0, "DebugIdentity");
+
const uint64 wall_time = env_->NowMicros();
string dump_file_name;
diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc
index 6c68729410..9584d8b9f3 100644
--- a/tensorflow/core/debug/grpc_session_debug_test.cc
+++ b/tensorflow/core/debug/grpc_session_debug_test.cc
@@ -187,7 +187,10 @@ TEST_F(GrpcSessionDebugTest, FileDebugURL) {
IsSingleFloatValue(outputs[0], 4.0);
std::vector<Tensor> dumped_tensors;
- LoadTensorDumps("n", &dumped_tensors);
+ LoadTensorDumps(io::JoinPath(DebugNodeKey::DeviceNameToDevicePath(
+ cluster->devices()[0].name()),
+ "n"),
+ &dumped_tensors);
if (i == 0 || i == 5) {
ASSERT_EQ(0, dumped_tensors.size());
@@ -267,7 +270,10 @@ TEST_F(GrpcSessionDebugTest, MultiDevices_String) {
TF_CHECK_OK(session->Close());
std::vector<Tensor> dumped_tensors;
- LoadTensorDumps("n", &dumped_tensors);
+ LoadTensorDumps(
+ io::JoinPath(DebugNodeKey::DeviceNameToDevicePath(a_dev.name()),
+ "n"),
+ &dumped_tensors);
ASSERT_EQ(1, dumped_tensors.size());
ASSERT_EQ(TensorShape({2, 2}), dumped_tensors[0].shape());
for (size_t i = 0; i < 4; ++i) {
diff --git a/tensorflow/core/kernels/debug_ops_test.cc b/tensorflow/core/kernels/debug_ops_test.cc
index 037272e009..1830b2a178 100644
--- a/tensorflow/core/kernels/debug_ops_test.cc
+++ b/tensorflow/core/kernels/debug_ops_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/test.h"
@@ -94,7 +95,22 @@ TEST_F(DebugIdentityOpTest, Int32Success_6_FileURLs) {
ASSERT_TRUE(env_->FileExists(dump_roots[i]).ok());
ASSERT_TRUE(env_->IsDirectory(dump_roots[i]).ok());
- DIR* dir = opendir(dump_roots[i].c_str());
+ std::vector<string> device_roots;
+ DIR* dir0 = opendir(dump_roots[i].c_str());
+ struct dirent* ent0;
+ const string kDeviceDirPrefix =
+ strings::StrCat(DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag);
+ while ((ent0 = readdir(dir0)) != nullptr) {
+ if (!strncmp(ent0->d_name, kDeviceDirPrefix.c_str(),
+ kDeviceDirPrefix.size())) {
+ device_roots.push_back(io::JoinPath(dump_roots[i], ent0->d_name));
+ }
+ }
+ ASSERT_EQ(1, device_roots.size());
+ closedir(dir0);
+
+ const string& device_root = device_roots[0];
+ DIR* dir = opendir(device_root.c_str());
struct dirent* ent;
int dump_files_found = 0;
while ((ent = readdir(dir)) != nullptr) {
@@ -102,8 +118,7 @@ TEST_F(DebugIdentityOpTest, Int32Success_6_FileURLs) {
dump_files_found++;
// Try reading the file into a Event proto.
- const string dump_file_path =
- strings::StrCat(dump_roots[i], "/", ent->d_name);
+ const string dump_file_path = io::JoinPath(device_root, ent->d_name);
std::fstream ifs(dump_file_path, std::ios::in | std::ios::binary);
Event event;
event.ParseFromIstream(&ifs);
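The C++ test above discovers the per-device directories with readdir(); the Python loader in debug_data.py below does the equivalent with a glob. A minimal sketch of that discovery step:

    import glob
    import os

    def list_device_dirs(dump_root):
      # Same rule as DebugDumpDir._load_all_device_dumps in this CL: each
      # device's GraphDef and tensor dumps live under a "_tfdbg_device_*"
      # sub-directory of the dump root.
      return glob.glob(os.path.join(dump_root, "_tfdbg_device_*"))

    for device_dir in list_device_dirs("/tmp/tfdbg_1"):
      print(os.path.basename(device_dir))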
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index dfb04a9744..07d0a9ec73 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -545,6 +545,22 @@ cuda_py_test(
tags = ["notsan"],
)
+cuda_py_test(
+ name = "session_debug_multi_gpu_test",
+ size = "small",
+ srcs = ["lib/session_debug_multi_gpu_test.py"],
+ additional_deps = [
+ ":debug_data",
+ ":debug_utils",
+ "//tensorflow/python:client",
+ "//tensorflow/python:framework_for_generated_wrappers",
+ "//tensorflow/python:framework_test_lib",
+ "//tensorflow/python:math_ops",
+ "//tensorflow/python:platform_test",
+ "//tensorflow/python:variables",
+ ],
+)
+
py_test(
name = "debugger_cli_common_test",
size = "small",
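For context, the shape of the new multi-GPU test can be sketched as follows. This is a hypothetical usage snippet, not code from session_debug_multi_gpu_test.py; it assumes two visible GPUs and a writable /tmp/tfdbg_multi_gpu:

    import tensorflow as tf
    from tensorflow.python.debug.lib import debug_data
    from tensorflow.python.debug.lib import debug_utils

    with tf.Session() as sess:
      with tf.device("/gpu:0"):
        u = tf.Variable(10.0, name="u")
        v = u * 2.0
      with tf.device("/gpu:1"):
        w = v * 3.0
      sess.run(u.initializer)

      # Watch the whole graph and send dumps to a file:// debug URL.
      run_options = tf.RunOptions()
      debug_utils.watch_graph(
          run_options, sess.graph, debug_urls="file:///tmp/tfdbg_multi_gpu")
      sess.run(w, options=run_options)

    # With this CL, the dump root contains one _tfdbg_device_* sub-directory
    # per device that produced dumps, and DebugDumpDir loads them all.
    dump = debug_data.DebugDumpDir("/tmp/tfdbg_multi_gpu")
    print(dump.devices())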
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 35e9fef29f..748e5d78ba 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -19,10 +19,12 @@ from __future__ import division
from __future__ import print_function
import collections
+import glob
import json
import os
import numpy as np
+import six
from six.moves import xrange # pylint: disable=redefined-builtin
from tensorflow.core.framework import graph_pb2
@@ -32,9 +34,12 @@ from tensorflow.python.framework import tensor_util
from tensorflow.python.platform import gfile
+# TODO(cais): Tie these string constants in with C++?
METADATA_FILE_PREFIX = "_tfdbg_"
CORE_METADATA_TAG = "core_metadata_"
GRAPH_FILE_TAG = "graph_"
+DEVICE_TAG = "device_"
+
FETCHES_INFO_FILE_TAG = "fetches_info_"
FEED_KEYS_INFO_FILE_TAG = "feed_keys_info_"
@@ -158,10 +163,6 @@ def parse_node_or_tensor_name(name):
return name, None
-def _is_core_metadata_file(file_name):
- return file_name.startswith(METADATA_FILE_PREFIX + CORE_METADATA_TAG)
-
-
def _is_graph_file(file_name):
return file_name.startswith(METADATA_FILE_PREFIX + GRAPH_FILE_TAG)
@@ -344,6 +345,28 @@ def extract_core_metadata_from_event_proto(event):
json_metadata["target_nodes"])
+def device_name_to_device_path(device_name):
+ """Convert device name to device path."""
+ device_name_items = device_name.split("/")
+ device_name_items = [item.replace(":", "_") for item in device_name_items]
+ return METADATA_FILE_PREFIX + DEVICE_TAG + ",".join(device_name_items)
+
+
+def device_path_to_device_name(device_dir):
+ """Parse device name from device path.
+
+ Args:
+ device_dir: (str) a directory name for the device.
+
+ Returns:
+ (str) parsed device name.
+ """
+ path_items = os.path.basename(device_dir)[
+ len(METADATA_FILE_PREFIX) + len(DEVICE_TAG):].split(",")
+ return "/".join([
+ path_item.replace("_", ":", 1) for path_item in path_items])
+
+
class DebugTensorDatum(object):
"""A single tensor dumped by TensorFlow Debugger (tfdbg).
@@ -360,13 +383,17 @@ class DebugTensorDatum(object):
"""`DebugTensorDatum` constructor.
Args:
- dump_root: (`str`) Debug dump root directory.
+ dump_root: (`str`) Debug dump root directory. This path should not include
+ the path component that represents the device name (see also below).
debug_dump_rel_path: (`str`) Path to a debug dump file, relative to the
- `dump_root`. For example, suppose the debug dump root
- directory is `/tmp/tfdbg_1` and the dump file is at
- `/tmp/tfdbg_1/ns_1/node_a_0_DebugIdentity_123456789`, then
- the value of the debug_dump_rel_path should be
- `ns_1/node_a_0_DebugIdenity_1234456789`.
+ `dump_root`. The first item of this relative path is assumed to be
+ a path representing the name of the device that the Tensor belongs to.
+ See `device_path_to_device_name` for more details on the device path.
+ For example, suppose the debug dump root
+ directory is `/tmp/tfdbg_1` and the dump file is at
+ `/tmp/tfdbg_1/<device_path>/ns_1/node_a_0_DebugIdentity_123456789`,
+ then the value of the debug_dump_rel_path should be
+ `<device_path>/ns_1/node_a_0_DebugIdentity_123456789`.
Raises:
ValueError: If the base file name of the dump file does not conform to
@@ -374,15 +401,13 @@ class DebugTensorDatum(object):
`node_name`_`output_slot`_`debug_op`_`timestamp`
"""
- base = os.path.basename(debug_dump_rel_path)
-
+ path_components = os.path.normpath(debug_dump_rel_path).split(os.sep)
+ self._device_name = device_path_to_device_name(path_components[0])
+ base = path_components[-1]
if base.count("_") < 3:
raise ValueError(
"Dump file path does not conform to the naming pattern: %s" % base)
- # TODO(cais): Add hostname and pid to support dumps from distributed
- # sessions.
-
self._extended_timestamp = base.split("_")[-1]
# It may include an index suffix at the end if file path collision happened
# due to identical timestamps.
@@ -395,31 +420,23 @@ class DebugTensorDatum(object):
self._debug_op = base.split("_")[-2]
self._output_slot = int(base.split("_")[-3])
- namespace = os.path.dirname(debug_dump_rel_path).replace("\\", "/")
node_base_name = "_".join(base.split("_")[:-3])
- if not namespace or namespace == ".":
- self._node_name = node_base_name
- else:
- self._node_name = namespace + "/" + node_base_name
+ self._node_name = "/".join(path_components[1:-1] + [node_base_name])
self._file_path = os.path.join(dump_root, debug_dump_rel_path)
self._dump_size_bytes = (gfile.Stat(self._file_path).length if
gfile.Exists(self._file_path) else None)
- self._run_fetches_info = None
- self._run_feed_keys_info = None
-
def __str__(self):
- return "{DebugTensorDatum: %s:%d @ %s @ %d}" % (self.node_name,
- self.output_slot,
- self.debug_op,
- self.timestamp)
+ return "{DebugTensorDatum (%s) %s:%d @ %s @ %d}" % (self.device_name,
+ self.node_name,
+ self.output_slot,
+ self.debug_op,
+ self.timestamp)
def __repr__(self):
return self.__str__()
- # TODO(cais): (b/38325442) Add device name information to this class.
-
def get_tensor(self):
"""Get tensor from the dump (`Event`) file.
@@ -465,6 +482,16 @@ class DebugTensorDatum(object):
return self._debug_op
@property
+ def device_name(self):
+ """Name of the device that the tensor belongs to.
+
+ Returns:
+ (`str`) device name.
+ """
+
+ return self._device_name
+
+ @property
def node_name(self):
"""Name of the node from which the tensor value was dumped.
@@ -529,6 +556,8 @@ class WatchKeyDoesNotExistInDebugDumpDirError(ValueError):
pass
+# TODO(cais): This class is getting too large in line count. Refactor to make it
+# smaller and easier to maintain.
class DebugDumpDir(object):
"""Data set from a debug-dump directory on filesystem.
@@ -548,23 +577,54 @@ class DebugDumpDir(object):
Raises:
IOError: If dump_root does not exist as a directory.
+ ValueError: If more than one core metadata file is found under the dump
+ root directory.
"""
if not gfile.IsDirectory(dump_root):
raise IOError("Dump root directory %s does not exist" % dump_root)
- self._core_metadata = None
- self._load_dumps(dump_root)
- self._create_tensor_watch_maps()
- self._load_partition_graphs(partition_graphs, validate)
+ self._core_metadata = []
+
+ # Find the list of devices.
+ self._dump_root = dump_root
+
+ self._load_core_metadata()
+ self._load_fetches_info()
+ self._load_feeds_info()
+ self._load_all_device_dumps(partition_graphs, validate)
self._python_graph = None
- def _load_dumps(self, dump_root):
- """Load `DebugTensorDatum` instances from the dump root.
+ def _load_all_device_dumps(self, partition_graphs, validate):
+ """Load the dump data for all devices."""
+ device_dirs = glob.glob(os.path.join(
+ self._dump_root, METADATA_FILE_PREFIX + DEVICE_TAG + "*"))
+
+ self._device_names = []
+ self._t0s = {}
+ self._dump_tensor_data = {}
+ self._dump_graph_file_paths = {}
+ self._debug_watches = {}
+ self._watch_key_to_devices = {}
+ self._watch_key_to_datum = {}
+ self._watch_key_to_rel_time = {}
+ self._watch_key_to_dump_size_bytes = {}
+ for device_dir in device_dirs:
+ device_name = device_path_to_device_name(device_dir)
+ self._device_names.append(device_name)
+ self._load_device_dumps(device_name, device_dir)
+ self._load_partition_graphs(partition_graphs, validate)
+ self._calculate_t0()
+
+ for device_name in self._device_names:
+ self._create_tensor_watch_maps(device_name)
- Populates a list of `DebugTensorDatum` instance and sorts the list by
- ascending timestamp.
+ def _load_device_dumps(self, device_name, device_root):
+ """Load `DebugTensorDatum` instances from the dump root of a given device.
+
+ Populates a map {device_name: a list of `DebugTensorDatum`}, where the list
+ is sorted by ascending timestamp.
This sorting order reflects the order in which the TensorFlow executor
processed the nodes of the graph. It is (one of many possible) topological
@@ -584,55 +644,67 @@ class DebugDumpDir(object):
graphs may not be available, e.g., when the run errors out.
Args:
- dump_root: (`str`) Dump root directory.
- """
+ device_name: (`str`) name of the device.
+ device_root: (`str`) dump root directory of the given device.
- self._dump_root = dump_root
- self._dump_tensor_data = []
- self._dump_graph_file_paths = []
+ Raises:
+ ValueError: If GraphDef for the device is not available.
+ """
- self._debug_watches = collections.defaultdict(
+ self._dump_tensor_data[device_name] = []
+ self._debug_watches[device_name] = collections.defaultdict(
lambda: collections.defaultdict(set))
- for root, _, files in gfile.Walk(self._dump_root):
+ for root, _, files in gfile.Walk(device_root):
for f in files:
- if f.startswith(METADATA_FILE_PREFIX):
- if _is_core_metadata_file(f):
- self._load_core_metadata(os.path.join(self._dump_root, root, f))
-
- if _is_graph_file(f):
- self._dump_graph_file_paths.append(
- os.path.join(self._dump_root, root, f))
-
- if _is_run_fetches_info_file(f):
- self._run_fetches_info = _load_log_message_from_event_file(
- os.path.join(root, f))
-
- if _is_run_feed_keys_info_file(f):
- self._run_feed_keys_info = _load_log_message_from_event_file(
- os.path.join(root, f))
-
- continue
-
- datum = self._dump_file_name_to_datum(root, f)
- self._dump_tensor_data.append(datum)
-
- self._debug_watches[datum.node_name][datum.output_slot].add(
- datum.debug_op)
-
- self._dump_tensor_data = sorted(
- self._dump_tensor_data, key=lambda x: x.extended_timestamp)
-
- if self._dump_tensor_data:
- self._t0 = self._dump_tensor_data[0].timestamp
+ if _is_graph_file(f):
+ self._dump_graph_file_paths[device_name] = os.path.join(
+ device_root, root, f)
+ else:
+ datum = self._dump_file_name_to_datum(root, f)
+ self._dump_tensor_data[device_name].append(datum)
+ self._debug_watches[device_name][datum.node_name][
+ datum.output_slot].add(datum.debug_op)
+
+ self._dump_tensor_data[device_name] = sorted(
+ self._dump_tensor_data[device_name],
+ key=lambda x: x.extended_timestamp)
+
+ if self._dump_tensor_data[device_name]:
+ self._t0s[device_name] = self._dump_tensor_data[device_name][0].timestamp
else:
- self._t0 = None
-
- def _load_core_metadata(self, event_file_path):
- event = event_pb2.Event()
- with gfile.Open(event_file_path, "rb") as f:
- event.ParseFromString(f.read())
- self._core_metadata = extract_core_metadata_from_event_proto(event)
+ self._t0s[device_name] = None
+
+ def _calculate_t0(self):
+ """Calculate the first timestamp across all devices."""
+ t0s = [t0 for t0 in six.itervalues(self._t0s) if t0 is not None]
+ self._t0 = min(t0s) if t0s else None
+
+ def _load_core_metadata(self):
+ core_metadata_files = glob.glob(os.path.join(
+ self._dump_root, METADATA_FILE_PREFIX + CORE_METADATA_TAG + "*"))
+ for core_metadata_file in core_metadata_files:
+ with gfile.Open(core_metadata_file, "rb") as f:
+ event = event_pb2.Event()
+ event.ParseFromString(f.read())
+ self._core_metadata.append(
+ extract_core_metadata_from_event_proto(event))
+
+ def _load_fetches_info(self):
+ fetches_info_files = glob.glob(os.path.join(
+ self._dump_root, METADATA_FILE_PREFIX + FETCHES_INFO_FILE_TAG + "*"))
+ self._run_fetches_info = []
+ for fetches_info_file in fetches_info_files:
+ self._run_fetches_info.append(
+ _load_log_message_from_event_file(fetches_info_file))
+
+ def _load_feeds_info(self):
+ feeds_info_files = glob.glob(os.path.join(
+ self._dump_root, METADATA_FILE_PREFIX + FEED_KEYS_INFO_FILE_TAG + "*"))
+ self._run_feed_keys_info = []
+ for feeds_info_file in feeds_info_files:
+ self._run_feed_keys_info.append(
+ _load_log_message_from_event_file(feeds_info_file))
def _dump_file_name_to_datum(self, dir_name, file_name):
"""Obtain a DebugTensorDatum from the directory and file name.
@@ -648,34 +720,39 @@ class DebugDumpDir(object):
# Calculate the relative path of the dump file with respect to the root.
debug_dump_rel_path = os.path.join(
os.path.relpath(dir_name, self._dump_root), file_name)
-
return DebugTensorDatum(self._dump_root, debug_dump_rel_path)
- def _create_tensor_watch_maps(self):
+ def _create_tensor_watch_maps(self, device_name):
"""Create maps from tensor watch keys to datum and to timestamps.
Create a map from watch key (tensor name + debug op) to `DebugTensorDatum`
item. Also make a map from watch key to relative timestamp.
"relative" means (absolute timestamp - t0).
+
+ Args:
+ device_name: (str) name of the device.
"""
- self._watch_key_to_datum = {}
- self._watch_key_to_rel_time = {}
- self._watch_key_to_dump_size_bytes = {}
- for datum in self._dump_tensor_data:
- if datum.watch_key not in self._watch_key_to_datum:
- self._watch_key_to_datum[datum.watch_key] = [datum]
- self._watch_key_to_rel_time[datum.watch_key] = [
- datum.timestamp - self._t0
- ]
- self._watch_key_to_dump_size_bytes[datum.watch_key] = [
- datum.dump_size_bytes
- ]
+ self._watch_key_to_datum[device_name] = {}
+ self._watch_key_to_rel_time[device_name] = {}
+ self._watch_key_to_dump_size_bytes[device_name] = {}
+ for datum in self._dump_tensor_data[device_name]:
+ if datum.watch_key not in self._watch_key_to_devices:
+ self._watch_key_to_devices[datum.watch_key] = {device_name}
+ else:
+ self._watch_key_to_devices[datum.watch_key].add(device_name)
+
+ if datum.watch_key not in self._watch_key_to_datum[device_name]:
+ self._watch_key_to_datum[device_name][datum.watch_key] = [datum]
+ self._watch_key_to_rel_time[device_name][datum.watch_key] = [
+ datum.timestamp - self._t0]
+ self._watch_key_to_dump_size_bytes[device_name][datum.watch_key] = [
+ datum.dump_size_bytes]
else:
- self._watch_key_to_datum[datum.watch_key].append(datum)
- self._watch_key_to_rel_time[datum.watch_key].append(datum.timestamp -
- self._t0)
- self._watch_key_to_dump_size_bytes[datum.watch_key].append(
+ self._watch_key_to_datum[device_name][datum.watch_key].append(datum)
+ self._watch_key_to_rel_time[device_name][datum.watch_key].append(
+ datum.timestamp - self._t0)
+ self._watch_key_to_dump_size_bytes[device_name][datum.watch_key].append(
datum.dump_size_bytes)
def set_python_graph(self, python_graph):
@@ -733,22 +810,32 @@ class DebugDumpDir(object):
`output_names`: Names of the output (fetched) Tensors.
`target_nodes`: Names of the target nodes.
If the core metadata have not been loaded, `None`.
+ If more than one core metadata file exists, returns a list of the
+ `namedtuple`s described above.
"""
- return self._core_metadata
+ output = self._core_metadata
+ return output[0] if len(output) == 1 else output
@property
def dumped_tensor_data(self):
- return self._dump_tensor_data
+ """Retrieve dumped tensor data."""
+ if len(self.devices()) == 1:
+ return self._dump_tensor_data[self.devices()[0]]
+ else:
+ all_devices_data = six.itervalues(self._dump_tensor_data)
+ data = []
+ for device_data in all_devices_data:
+ data.extend(device_data)
+ return sorted(data, key=lambda x: x.extended_timestamp)
@property
def t0(self):
- """Absolute timestamp of the first dumped tensor.
+ """Absolute timestamp of the first dumped tensor across all devices.
Returns:
(`int`) absolute timestamp of the first dumped tensor, in microseconds.
"""
-
return self._t0
@property
@@ -756,10 +843,10 @@ class DebugDumpDir(object):
"""Total number of dumped tensors in the dump root directory.
Returns:
- (`int`) total number of dumped tensors in the dump root directory.
+ (`int`) The total number of dumped tensors in the dump root directory.
"""
-
- return len(self._dump_tensor_data)
+ return sum(len(self._dump_tensor_data[device_name])
+ for device_name in self._dump_tensor_data)
def _load_partition_graphs(self, partition_graphs, validate):
"""Load and process partition graphs.
@@ -770,56 +857,73 @@ class DebugDumpDir(object):
tensor dumps.
Args:
- partition_graphs: Partition graphs executed by the TensorFlow runtime,
- represented as repeated fields of GraphDef.
- If no partition_graph is available, use None.
+ partition_graphs: A repeated field of GraphDefs representing the
+ partition graphs executed by the TensorFlow runtime.
validate: (`bool`) Whether the dump files are to be validated against the
partition graphs.
- """
- if partition_graphs:
- self._partition_graphs = partition_graphs
- elif self._dump_graph_file_paths:
- # In case partition graphs are not available from arguments, load them
- # from the dump directory.
- self._partition_graphs = [
- _load_graph_def_from_event_file(dump_file_path)
- for dump_file_path in self._dump_graph_file_paths
- ]
- else:
- self._partition_graphs = None
- return
+ Raises:
+ ValueError: If the partition GraphDef of one or more devices fails to
+ be loaded.
+ """
self._node_attributes = {}
-
self._node_inputs = {}
self._node_ctrl_inputs = {}
-
self._node_recipients = {}
self._node_ctrl_recipients = {}
-
- self._devices = []
self._node_devices = {}
self._node_op_types = {}
+ self._copy_send_nodes = {}
+
+ self._partition_graphs = {}
+ for device_name in self._device_names:
+ partition_graph = None
+ if device_name in self._dump_graph_file_paths:
+ partition_graph = _load_graph_def_from_event_file(
+ self._dump_graph_file_paths[device_name])
+ else:
+ partition_graph = self._find_partition_graph(partition_graphs,
+ device_name)
+
+ if partition_graph:
+ self._partition_graphs[device_name] = partition_graph
- self._copy_send_nodes = []
+ self._node_attributes[device_name] = {}
+ self._node_inputs[device_name] = {}
+ self._node_ctrl_inputs[device_name] = {}
+ self._node_recipients[device_name] = {}
+ self._node_ctrl_recipients[device_name] = {}
+ self._node_op_types[device_name] = {}
+ self._copy_send_nodes[device_name] = []
- for pg in self._partition_graphs:
- for node in pg.node:
- self._process_partition_graph_node(node)
+ if partition_graph:
+ for node in partition_graph.node:
+ self._process_partition_graph_node(device_name, node)
- self._prune_non_control_edges_of_debug_ops()
- self._prune_control_edges_of_debug_ops()
+ self._prune_non_control_edges_of_debug_ops(device_name)
+ self._prune_control_edges_of_debug_ops(device_name)
- self._populate_recipient_maps()
+ self._populate_recipient_maps(device_name)
- if validate:
- self._validate_dump_with_graphs()
+ if device_name in self._partition_graphs and validate:
+ self._validate_dump_with_graphs(device_name)
+
+ def _find_partition_graph(self, partition_graphs, device_name):
+ if partition_graphs is None:
+ return None
+ else:
+ for graph_def in partition_graphs:
+ for node_def in graph_def.node:
+ if node_def.device == device_name:
+ return graph_def
+ return None
- def _process_partition_graph_node(self, node):
+ def _process_partition_graph_node(self, device_name, node):
"""Process a node from the partition graphs.
Args:
+ device_name: (str) device name.
node: (NodeDef) A partition-graph node to be processed.
Raises:
@@ -833,84 +937,91 @@ class DebugDumpDir(object):
(watched_node_name, watched_output_slot, _,
debug_op) = parse_debug_node_name(node.name)
- self._debug_watches[watched_node_name][watched_output_slot].add(
- debug_op)
+ self._debug_watches[device_name][watched_node_name][
+ watched_output_slot].add(debug_op)
return
- if node.name in self._node_inputs:
- raise ValueError("Duplicate node name: '%s'" % node.name)
+ if node.name in self._node_inputs[device_name]:
+ raise ValueError("Duplicate node name on device %s: '%s'" %
+ (device_name, node.name))
- self._node_attributes[node.name] = node.attr
+ self._node_attributes[device_name][node.name] = node.attr
- if node.device not in self._devices and node.device:
- self._devices.append(node.device)
+ self._node_inputs[device_name][node.name] = []
+ self._node_ctrl_inputs[device_name][node.name] = []
+ self._node_recipients[device_name][node.name] = []
+ self._node_ctrl_recipients[device_name][node.name] = []
- self._node_inputs[node.name] = []
- self._node_ctrl_inputs[node.name] = []
- self._node_recipients[node.name] = []
- self._node_ctrl_recipients[node.name] = []
-
- self._node_devices[node.name] = node.device
- self._node_op_types[node.name] = node.op
+ if node.name not in self._node_devices:
+ self._node_devices[node.name] = set()
+ self._node_devices[node.name].add(node.device)
+ self._node_op_types[device_name][node.name] = node.op
for inp in node.input:
if is_copy_node(inp) and (node.op == "_Send" or node.op == "_Retval"):
- self._copy_send_nodes.append(node.name)
+ self._copy_send_nodes[device_name].append(node.name)
if inp.startswith("^"):
cinp = inp[1:]
- self._node_ctrl_inputs[node.name].append(cinp)
+ self._node_ctrl_inputs[device_name][node.name].append(cinp)
else:
- self._node_inputs[node.name].append(inp)
+ self._node_inputs[device_name][node.name].append(inp)
- def _prune_nodes_from_input_and_recipient_maps(self, nodes_to_prune):
+ def _prune_nodes_from_input_and_recipient_maps(self,
+ device_name,
+ nodes_to_prune):
"""Prune nodes out of input and recipient maps.
Args:
+ device_name: (`str`) device name.
nodes_to_prune: (`list` of `str`) Names of the nodes to be pruned.
"""
for node in nodes_to_prune:
- del self._node_inputs[node]
- del self._node_ctrl_inputs[node]
- del self._node_recipients[node]
- del self._node_ctrl_recipients[node]
+ del self._node_inputs[device_name][node]
+ del self._node_ctrl_inputs[device_name][node]
+ del self._node_recipients[device_name][node]
+ del self._node_ctrl_recipients[device_name][node]
- def _prune_non_control_edges_of_debug_ops(self):
+ def _prune_non_control_edges_of_debug_ops(self, device_name):
"""Prune (non-control) edges related to debug ops.
Prune the Copy ops and associated _Send ops inserted by the debugger out
from the non-control inputs and output recipients map. Replace the inputs
and recipients with original ones.
+
+ Args:
+ device_name: (`str`) device name.
"""
copy_nodes = []
- for node in self._node_inputs:
- if node in self._copy_send_nodes:
+ for node in self._node_inputs[device_name]:
+ if node in self._copy_send_nodes[device_name]:
continue
if is_copy_node(node):
copy_nodes.append(node)
- inputs = self._node_inputs[node]
+ inputs = self._node_inputs[device_name][node]
for i in xrange(len(inputs)):
inp = inputs[i]
if is_copy_node(inp):
# Find the input to the Copy node, which should be the original
# input to the node.
- orig_inp = self._node_inputs[inp][0]
+ orig_inp = self._node_inputs[device_name][inp][0]
inputs[i] = orig_inp
- self._prune_nodes_from_input_and_recipient_maps(copy_nodes)
- self._prune_nodes_from_input_and_recipient_maps(self._copy_send_nodes)
+ self._prune_nodes_from_input_and_recipient_maps(device_name, copy_nodes)
+ self._prune_nodes_from_input_and_recipient_maps(
+ device_name, self._copy_send_nodes[device_name])
- def _prune_control_edges_of_debug_ops(self):
+ def _prune_control_edges_of_debug_ops(self, device_name):
"""Prune control edges related to the debug ops."""
- for node in self._node_ctrl_inputs:
- ctrl_inputs = self._node_ctrl_inputs[node]
+ for node in self._node_ctrl_inputs[device_name]:
+ ctrl_inputs = self._node_ctrl_inputs[device_name][node]
debug_op_inputs = []
for ctrl_inp in ctrl_inputs:
if is_debug_node(ctrl_inp):
@@ -918,33 +1029,36 @@ class DebugDumpDir(object):
for debug_op_inp in debug_op_inputs:
ctrl_inputs.remove(debug_op_inp)
- def _populate_recipient_maps(self):
+ def _populate_recipient_maps(self, device_name):
"""Populate the map from node name to recipient(s) of its output(s)."""
- for node in self._node_inputs:
- inputs = self._node_inputs[node]
+ for node in self._node_inputs[device_name]:
+ inputs = self._node_inputs[device_name][node]
for inp in inputs:
inp = get_node_name(inp)
- if inp not in self._node_recipients:
- self._node_recipients[inp] = []
- self._node_recipients[inp].append(node)
+ if inp not in self._node_recipients[device_name]:
+ self._node_recipients[device_name][inp] = []
+ self._node_recipients[device_name][inp].append(node)
- for node in self._node_ctrl_inputs:
- ctrl_inputs = self._node_ctrl_inputs[node]
+ for node in self._node_ctrl_inputs[device_name]:
+ ctrl_inputs = self._node_ctrl_inputs[device_name][node]
for ctrl_inp in ctrl_inputs:
- if ctrl_inp in self._copy_send_nodes:
+ if ctrl_inp in self._copy_send_nodes[device_name]:
continue
- if ctrl_inp not in self._node_ctrl_recipients:
- self._node_ctrl_recipients[ctrl_inp] = []
- self._node_ctrl_recipients[ctrl_inp].append(node)
+ if ctrl_inp not in self._node_ctrl_recipients[device_name]:
+ self._node_ctrl_recipients[device_name][ctrl_inp] = []
+ self._node_ctrl_recipients[device_name][ctrl_inp].append(node)
- def _validate_dump_with_graphs(self):
+ def _validate_dump_with_graphs(self, device_name):
"""Validate the dumped tensor data against the partition graphs.
Only the watched nodes are validated by this method, because tfdbg allows
clients to watch only a subset of the nodes.
+ Args:
+ device_name: (`str`) device name.
+
Raises:
LookupError: If the partition graphs have not been loaded yet.
ValueError: If dumps contain node names not found in partition graph.
@@ -952,33 +1066,35 @@ class DebugDumpDir(object):
input relations on the partition graphs.
"""
- if not self._partition_graphs:
- raise LookupError("No partition graphs loaded.")
+ if not self._partition_graphs[device_name]:
+ raise LookupError(
+ "No partition graphs loaded for device %s" % device_name)
# Verify that the node names in the dump data are all present in the
# partition graphs.
- for datum in self._dump_tensor_data:
- if datum.node_name not in self._node_inputs:
- raise ValueError("Node name '%s' is not found in partition graphs." %
- datum.node_name)
+ for datum in self._dump_tensor_data[device_name]:
+ if datum.node_name not in self._node_inputs[device_name]:
+ raise ValueError("Node name '%s' is not found in partition graphs of "
+ "device %s." % (datum.node_name, device_name))
pending_inputs = {}
- for node in self._node_inputs:
+ for node in self._node_inputs[device_name]:
pending_inputs[node] = []
- inputs = self._node_inputs[node]
+ inputs = self._node_inputs[device_name][node]
for inp in inputs:
inp_node = get_node_name(inp)
inp_output_slot = get_output_slot(inp)
# Inputs from Enter and NextIteration nodes are not validated because
# DebugNodeInserter::InsertNodes() in the debugger core skips creating
# control edges from debug ops watching these types of nodes.
- if (inp_node in self._debug_watches and
- inp_output_slot in self._debug_watches[inp_node] and
- self._node_op_types.get(inp) not in ("Enter", "NextIteration") and
+ if (inp_node in self._debug_watches[device_name] and
+ inp_output_slot in self._debug_watches[device_name][inp_node] and
+ self._node_op_types[device_name].get(inp) not in (
+ "Enter", "NextIteration") and
(inp_node, inp_output_slot) not in pending_inputs[node]):
pending_inputs[node].append((inp_node, inp_output_slot))
- for i, datum in enumerate(self._dump_tensor_data):
+ for i, datum in enumerate(self._dump_tensor_data[device_name]):
node = datum.node_name
slot = datum.output_slot
# In some cases (e.g., system clocks with insufficient precision),
@@ -986,13 +1102,13 @@ class DebugDumpDir(object):
# following check examines this possibility and avoids raising an error if
# that is the case.
if not self._satisfied_at_timestamp(
- pending_inputs[node], datum.timestamp, start_i=i + 1):
+ device_name, pending_inputs[node], datum.timestamp, start_i=i + 1):
raise ValueError("Causality violated in timing relations of debug "
"dumps: %s (%d): "
"these input(s) are not satisfied: %s" %
(node, datum.timestamp, repr(pending_inputs[node])))
- recipients = self._node_recipients[node]
+ recipients = self._node_recipients[device_name][node]
for recipient in recipients:
recipient_pending_inputs = pending_inputs[recipient]
if (node, slot) in recipient_pending_inputs:
@@ -1004,12 +1120,13 @@ class DebugDumpDir(object):
del recipient_pending_inputs[
recipient_pending_inputs.index((node, slot))]
- def _satisfied_at_timestamp(self, pending, timestamp, start_i=0):
+ def _satisfied_at_timestamp(self, device_name, pending, timestamp, start_i=0):
"""Determine whether pending inputs are satisfied at given timestamp.
Note: This method mutates the input argument "pending".
Args:
+ device_name: (str) device name.
pending: A list of 2-tuple (node_name, output_slot): the dependencies to
check.
timestamp: (int) the timestamp in question.
@@ -1023,7 +1140,7 @@ class DebugDumpDir(object):
if not pending:
return True
- for datum in self._dump_tensor_data[start_i:]:
+ for datum in self._dump_tensor_data[device_name][start_i:]:
if datum.timestamp > timestamp:
break
if (datum.timestamp == timestamp and
@@ -1042,7 +1159,7 @@ class DebugDumpDir(object):
"""Get the partition graphs.
Returns:
- Partition graphs as repeated fields of GraphDef.
+ Partition graphs as a list of GraphDef.
Raises:
LookupError: If no partition graphs have been loaded.
@@ -1051,50 +1168,90 @@ class DebugDumpDir(object):
if self._partition_graphs is None:
raise LookupError("No partition graphs have been loaded.")
- return self._partition_graphs
+ return self._partition_graphs.values()
@property
def run_fetches_info(self):
"""Get a str representation of the fetches used in the Session.run() call.
Returns:
- If the information is available, a `str` obtained from `repr(fetches)`.
+ If the information is available from one `Session.run` call, a `str`
+ obtained from `repr(fetches)`.
+ If the information is available from multiple `Session.run` calls, a
+ `list` of `str` from `repr(fetches)`.
If the information is not available, `None`.
"""
- return self._run_fetches_info
+ output = self._run_fetches_info
+ return output[0] if len(output) == 1 else output
@property
def run_feed_keys_info(self):
"""Get a str representation of the feed_dict used in the Session.run() call.
Returns:
- If the information is available, a `str` obtained from `repr(feed_dict)`.
+ If the information is available from one `Session.run` call, a `str`
+ obtained from `repr(feed_dict)`.
+ If the information is available from multiple `Session.run` calls, a
+ `list` of `str` obtained from `repr(feed_dict)`.
If the information is not available, `None`.
"""
- return self._run_feed_keys_info
+ output = self._run_feed_keys_info
+ return output[0] if len(output) == 1 else output
+
+ def _infer_device_name(self, device_name, node_name):
+ if device_name is None:
+ if len(self.devices()) == 1:
+ return self.devices()[0]
+ else:
+ if node_name in self._node_devices:
+ if len(self._node_devices[node_name]) == 1:
+ return list(self._node_devices[node_name])[0]
+ else:
+ raise ValueError(
+ "There are multiple (%d) devices with nodes named '%s' but "
+ "device_name is not specified." %
+ (len(self._node_devices[node_name]), node_name))
+ else:
+ raise ValueError("None of the %d devices has a node named '%s'." %
+ (len(self._device_names), node_name))
+ else:
+ return device_name
- def nodes(self):
+ def nodes(self, device_name=None):
"""Get a list of all nodes from the partition graphs.
+ Args:
+ device_name: (`str`) name of device. If there is only one device, this
+ argument is optional.
+
Returns:
All nodes' names, as a list of str.
Raises:
LookupError: If no partition graphs have been loaded.
+ ValueError: If there are multiple devices, but device_name is not
+ specified.
"""
-
if self._partition_graphs is None:
raise LookupError("No partition graphs have been loaded.")
+ if device_name is None:
+ if len(self.devices()) == 1:
+ device_name = self.devices()[0]
+ else:
+ raise ValueError(
+ "There are multiple (%d) devices, but "
+ "device_name is not specified." % len(self.devices()))
+ return [node_name for node_name in self._node_inputs[device_name]]
- return [node_name for node_name in self._node_inputs]
-
- def node_attributes(self, node_name):
+ def node_attributes(self, node_name, device_name=None):
"""Get the attributes of a node.
Args:
node_name: Name of the node in question.
+ device_name: (`str`) name of the device. If there is only one device or if
+ node_name exists on only one device, this argument is optional.
Returns:
Attributes of the node.
@@ -1103,22 +1260,25 @@ class DebugDumpDir(object):
LookupError: If no partition graphs have been loaded.
ValueError: If no node named node_name exists.
"""
-
if self._partition_graphs is None:
raise LookupError("No partition graphs have been loaded.")
- if node_name in self._node_attributes:
- return self._node_attributes[node_name]
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name in self._node_attributes[device_name]:
+ return self._node_attributes[device_name][node_name]
else:
- raise ValueError("No node named \"%s\" exists." % node_name)
+ raise ValueError("No node named \"%s\" exists on device %s." % (
+ node_name, device_name))
- def node_inputs(self, node_name, is_control=False):
+ def node_inputs(self, node_name, is_control=False, device_name=None):
"""Get the inputs of given node according to partition graphs.
Args:
node_name: Name of the node.
is_control: (`bool`) Whether control inputs, rather than non-control
inputs, are to be returned.
+ device_name: (`str`) name of the device. If there is only one device or if
+ node_name exists on only one device, this argument is optional.
Returns:
(`list` of `str`) inputs to the node, as a list of node names.
@@ -1133,21 +1293,27 @@ class DebugDumpDir(object):
raise LookupError(
"Node inputs are not loaded from partition graphs yet.")
- if node_name not in self._node_inputs:
- raise ValueError("Node '%s' does not exist in partition graphs." %
- node_name)
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name not in self._node_inputs[device_name]:
+ raise ValueError("Node '%s' does not exist in the partition graph of "
+ "device %s." % (node_name, device_name))
if is_control:
- return self._node_ctrl_inputs[node_name]
+ return self._node_ctrl_inputs[device_name][node_name]
else:
- return self._node_inputs[node_name]
+ return self._node_inputs[device_name][node_name]
- def transitive_inputs(self, node_name, include_control=True):
+ def transitive_inputs(self,
+ node_name,
+ include_control=True,
+ device_name=None):
"""Get the transitive inputs of given node according to partition graphs.
Args:
- node_name: Name of the node
+ node_name: Name of the node.
include_control: Include control inputs (True by default).
+ device_name: (`str`) name of the device. If there is only one device or if
+ node_name exists on only one device, this argument is optional.
Returns:
(`list` of `str`) all transitive inputs to the node, as a list of node
@@ -1163,9 +1329,11 @@ class DebugDumpDir(object):
raise LookupError(
"Node inputs are not loaded from partition graphs yet.")
- if node_name not in self._node_inputs:
- raise ValueError("Node '%s' does not exist in partition graphs." %
- node_name)
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name not in self._node_inputs[device_name]:
+ raise ValueError(
+ "Node '%s' does not exist in the partition graph of device %s." %
+ (node_name, device_name))
inputs = []
@@ -1186,21 +1354,21 @@ class DebugDumpDir(object):
# Stop the tracing at a Merge op, as it is generally impossible to infer
# outside the runtime which input to the Merge op is alive.
- if self._node_op_types[node] == "Merge":
+ if self._node_op_types[device_name][node] == "Merge":
return
if node in visited_nodes:
return
visited_nodes.append(node)
- for inp in self._node_inputs[node]:
+ for inp in self._node_inputs[device_name][node]:
if inp == node_name:
continue
inputs.append(inp)
trace_inputs(inp)
if include_control:
- for ctrl_inp in self._node_ctrl_inputs[node]:
+ for ctrl_inp in self._node_ctrl_inputs[device_name][node]:
if ctrl_inp == node_name:
continue
inputs.append(ctrl_inp)
@@ -1210,13 +1378,15 @@ class DebugDumpDir(object):
return inputs
- def node_recipients(self, node_name, is_control=False):
+ def node_recipients(self, node_name, is_control=False, device_name=None):
"""Get recipient of the given node's output according to partition graphs.
Args:
node_name: (`str`) name of the node.
is_control: (`bool`) whether control outputs, rather than non-control
outputs, are to be returned.
+ device_name: (`str`) name of the device. If there is only one device or if
+ node_name exists on only one device, this argument is optional.
Returns:
(`list` of `str`) all inputs to the node, as a list of node names.
@@ -1231,58 +1401,67 @@ class DebugDumpDir(object):
raise LookupError(
"Node recipients are not loaded from partition graphs yet.")
- if node_name not in self._node_recipients:
- raise ValueError("Node '%s' does not exist in partition graphs." %
- node_name)
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name not in self._node_recipients[device_name]:
+ raise ValueError(
+ "Node '%s' does not exist in the partition graph of device %s." %
+ (node_name, device_name))
if is_control:
- return self._node_ctrl_recipients[node_name]
+ return self._node_ctrl_recipients[device_name][node_name]
else:
- return self._node_recipients[node_name]
+ return self._node_recipients[device_name][node_name]
def devices(self):
- """Get the list of devices.
+ """Get the list of device names.
Returns:
(`list` of `str`) names of the devices.
-
- Raises:
- LookupError: If node inputs and control inputs have not been loaded
- from partition graphs yet.
"""
- if self._partition_graphs is None:
- raise LookupError("Devices are not loaded from partition graphs yet.")
-
- return self._devices
+ return self._device_names
- def node_exists(self, node_name):
+ def node_exists(self, node_name, device_name=None):
"""Test if a node exists in the partition graphs.
Args:
node_name: (`str`) name of the node to be checked.
+ device_name: optional device name. If None, will search for the node
+ on all available devices. Otherwise, search for the node only on
+ the given device.
Returns:
A boolean indicating whether the node exists.
Raises:
LookupError: If no partition graphs have been loaded yet.
+ ValueError: If device_name is specified but cannot be found.
"""
if self._node_inputs is None:
raise LookupError(
"Nodes have not been loaded from partition graphs yet.")
- return node_name in self._node_inputs
+ if (device_name is not None) and device_name not in self._node_inputs:
+ raise ValueError(
+ "The specified device_name '%s' cannot be found." % device_name)
+
+ node_inputs_all_devices = (self._node_inputs.values() if device_name is None
+ else (self._node_inputs[device_name],))
+
+ return any(node_name in node_inputs
+ for node_inputs in node_inputs_all_devices)
def node_device(self, node_name):
- """Get the device of a node.
+ """Get the names of the devices that have nodes of the specified name.
Args:
node_name: (`str`) name of the node.
Returns:
- (`str`) name of the device on which the node is placed.
+ (`str` or `list` of `str`) name of the device(s) on which the node of the
+ given name is found. Returns a `str` if there is only one such device,
+ otherwise returns a `list` of `str`.
Raises:
LookupError: If node inputs and control inputs have not been loaded
@@ -1298,13 +1477,16 @@ class DebugDumpDir(object):
raise ValueError("Node '%s' does not exist in partition graphs." %
node_name)
- return self._node_devices[node_name]
+ output = list(self._node_devices[node_name])
+ return output[0] if len(output) == 1 else output
- def node_op_type(self, node_name):
+ def node_op_type(self, node_name, device_name=None):
"""Get the op type of given node.
Args:
node_name: (`str`) name of the node.
+ device_name: (`str`) name of the device. If there is only one device or if
+ node_name exists on only one device, this argument is optional.
Returns:
(`str`) op type of the node.
@@ -1319,17 +1501,21 @@ class DebugDumpDir(object):
raise LookupError(
"Node op types are not loaded from partition graphs yet.")
- if node_name not in self._node_op_types:
- raise ValueError("Node '%s' does not exist in partition graphs." %
- node_name)
+ device_name = self._infer_device_name(device_name, node_name)
+ if node_name not in self._node_op_types[device_name]:
+ raise ValueError(
+ "Node '%s' does not exist in the partition graph of device '%s'. " %
+ (node_name, device_name))
- return self._node_op_types[node_name]
+ return self._node_op_types[device_name][node_name]
- def debug_watch_keys(self, node_name):
+ def debug_watch_keys(self, node_name, device_name=None):
"""Get all tensor watch keys of given node according to partition graphs.
Args:
node_name: (`str`) name of the node.
+ device_name: (`str`) name of the device. If there is only one device or if
+ node_name exists on only one device, this argument is optional.
Returns:
(`list` of `str`) all debug tensor watch keys. Returns an empty list if
@@ -1340,35 +1526,61 @@ class DebugDumpDir(object):
partition graphs yet.
"""
- if node_name not in self._debug_watches:
+ try:
+ device_name = self._infer_device_name(device_name, node_name)
+ except ValueError:
+ return []
+
+ if node_name not in self._debug_watches[device_name]:
return []
watch_keys = []
- for watched_slot in self._debug_watches[node_name]:
- debug_ops = self._debug_watches[node_name][watched_slot]
+ for watched_slot in self._debug_watches[device_name][node_name]:
+ debug_ops = self._debug_watches[device_name][node_name][watched_slot]
for debug_op in debug_ops:
watch_keys.append(
_get_tensor_watch_key(node_name, watched_slot, debug_op))
return watch_keys
- def watch_key_to_data(self, debug_watch_key):
+ def watch_key_to_data(self, debug_watch_key, device_name=None):
"""Get all `DebugTensorDatum` instances corresponding to a debug watch key.
Args:
debug_watch_key: (`str`) debug watch key.
+ device_name: (`str`) name of the device. If there is only one device or if
+ the specified debug_watch_key exists on only one device, this argument
+ is optional.
Returns:
A list of `DebugTensorDatum` instances that correspond to the debug watch
key. If the watch key does not exist, returns an empty list.
Raises:
- ValueError: If the debug watch key does not exist.
+ ValueError: If there are multiple devices that have the debug_watch_key,
+ but device_name is not specified.
"""
+ if device_name is None:
+ matching_device_names = [
+ name for name in self._watch_key_to_datum
+ if debug_watch_key in self._watch_key_to_datum[name]]
+ if not matching_device_names:
+ return []
+ elif len(matching_device_names) == 1:
+ device_name = matching_device_names[0]
+ else:
+ raise ValueError(
+ "The debug watch key '%s' exists on multiple (%d) devices, but "
+ "device name is not specified." %
+ (debug_watch_key, len(matching_device_names)))
+ elif device_name not in self._watch_key_to_datum:
+ raise ValueError(
+ "There is no device named '%s' consisting of debug watch keys." %
+ device_name)
- return self._watch_key_to_datum.get(debug_watch_key, [])
+ return self._watch_key_to_datum[device_name].get(debug_watch_key, [])
- def find(self, predicate, first_n=0):
+ def find(self, predicate, first_n=0, device_name=None):
"""Find dumped tensor data by a certain predicate.
Args:
@@ -1386,6 +1598,7 @@ class DebugDumpDir(object):
first_n: (`int`) return only the first n `DebugTensorDatum` instances (in
time order) for which the predicate returns True. To return all the
`DebugTensorDatum` instances, let first_n be <= 0.
+ device_name: optional device name.
Returns:
A list of all `DebugTensorDatum` objects in this `DebugDumpDir` object
@@ -1394,22 +1607,31 @@ class DebugDumpDir(object):
"""
matched_data = []
- for datum in self._dump_tensor_data:
- if predicate(datum, datum.get_tensor()):
- matched_data.append(datum)
+ for device in (self._dump_tensor_data if device_name is None
+ else (device_name,)):
+ for datum in self._dump_tensor_data[device]:
+ if predicate(datum, datum.get_tensor()):
+ matched_data.append(datum)
- if first_n > 0 and len(matched_data) >= first_n:
- break
+ if first_n > 0 and len(matched_data) >= first_n:
+ return matched_data
return matched_data
- def get_tensor_file_paths(self, node_name, output_slot, debug_op):
+ def get_tensor_file_paths(self,
+ node_name,
+ output_slot,
+ debug_op,
+ device_name=None):
"""Get the file paths from a debug-dumped tensor.
Args:
node_name: (`str`) name of the node that the tensor is produced by.
output_slot: (`int`) output slot index of tensor.
debug_op: (`str`) name of the debug op.
+ device_name: (`str`) name of the device. If there is only one device or if
+ the specified debug_watch_key exists on only one device, this argument
+ is optional.
Returns:
List of file path(s) loaded. This is a list because each debugged tensor
@@ -1420,14 +1642,17 @@ class DebugDumpDir(object):
the debug-dump data.
"""
+ device_name = self._infer_device_name(device_name, node_name)
watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
- if watch_key not in self._watch_key_to_datum:
+ if watch_key not in self._watch_key_to_datum[device_name]:
raise WatchKeyDoesNotExistInDebugDumpDirError(
- "Watch key \"%s\" does not exist in the debug dump" % watch_key)
+ "Watch key \"%s\" does not exist in the debug dump of device %s" %
+ (watch_key, device_name))
- return [datum.file_path for datum in self._watch_key_to_datum[watch_key]]
+ return [datum.file_path for datum in
+ self._watch_key_to_datum[device_name][watch_key]]
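With the per-device layout, returned paths gain a device subdirectory component. A sketch; `dump` and the device string are assumed:

paths = dump.get_tensor_file_paths(
    "node_foo_1", 2, "DebugIdentity",
    device_name="/job:localhost/replica:0/task:0/gpu:0")
# e.g. <dump_root>/_tfdbg_device_,job_localhost,replica_0,task_0,gpu_0/
#          node_foo_1_2_DebugIdentity_<timestamp>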
- def get_tensors(self, node_name, output_slot, debug_op):
+ def get_tensors(self, node_name, output_slot, debug_op, device_name=None):
"""Get the tensor value from for a debug-dumped tensor.
The tensor may be dumped multiple times in the dump root directory, so a
@@ -1437,6 +1662,9 @@ class DebugDumpDir(object):
node_name: (`str`) name of the node that the tensor is produced by.
output_slot: (`int`) output slot index of tensor.
debug_op: (`str`) name of the debug op.
+      device_name: (`str`) name of the device. If there is only one device or
+        if the specified watch key exists on only one device, this argument
+        is optional.
Returns:
List of tensors (`numpy.ndarray`) loaded from the debug-dump file(s).
@@ -1447,13 +1675,20 @@ class DebugDumpDir(object):
"""
watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
- if watch_key not in self._watch_key_to_datum:
+ try:
+ device_name = self._infer_device_name(device_name, node_name)
+ return [datum.get_tensor() for datum in
+ self._watch_key_to_datum[device_name][watch_key]]
+ except (ValueError, KeyError):
raise WatchKeyDoesNotExistInDebugDumpDirError(
- "Watch key \"%s\" does not exist in the debug dump" % watch_key)
-
- return [datum.get_tensor() for datum in self._watch_key_to_datum[watch_key]]
-
- def get_rel_timestamps(self, node_name, output_slot, debug_op):
+ "Watch key \"%s\" does not exist in the debug dump of device %s" %
+ (watch_key, device_name))
+
+ def get_rel_timestamps(self,
+ node_name,
+ output_slot,
+ debug_op,
+ device_name=None):
"""Get the relative timestamp from for a debug-dumped tensor.
Relative timestamp means (absolute timestamp - `t0`), where `t0` is the
@@ -1465,6 +1700,9 @@ class DebugDumpDir(object):
node_name: (`str`) name of the node that the tensor is produced by.
output_slot: (`int`) output slot index of tensor.
debug_op: (`str`) name of the debug op.
+      device_name: (`str`) name of the device. If there is only one device or
+        if the specified watch key exists on only one device, this argument
+        is optional.
Returns:
(`list` of `int`) list of relative timestamps.
@@ -1474,14 +1712,20 @@ class DebugDumpDir(object):
exist in the debug dump data.
"""
+ device_name = self._infer_device_name(device_name, node_name)
watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
- if watch_key not in self._watch_key_to_datum:
+ if watch_key not in self._watch_key_to_datum[device_name]:
raise WatchKeyDoesNotExistInDebugDumpDirError(
"Watch key \"%s\" does not exist in the debug dump" % watch_key)
- return self._watch_key_to_rel_time[watch_key]
+ # TODO(cais): Figure out whether this should be relative to the global t0.
+ return self._watch_key_to_rel_time[device_name][watch_key]
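Relative timestamps are offsets from the earliest dump timestamp `t0` (see `dump_dir.t0` in the tests below). A usage sketch with assumed names:

rel_times = dump.get_rel_timestamps(
    "node_foo_1", 2, "DebugIdentity",
    device_name="/job:localhost/replica:0/task:0/gpu:0")
# Each entry equals datum.timestamp - dump.t0 for the matching dump file.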
- def get_dump_sizes_bytes(self, node_name, output_slot, debug_op):
+ def get_dump_sizes_bytes(self,
+ node_name,
+ output_slot,
+ debug_op,
+ device_name=None):
"""Get the sizes of the dump files for a debug-dumped tensor.
Unit of the file size: byte.
@@ -1490,6 +1734,9 @@ class DebugDumpDir(object):
node_name: (`str`) name of the node that the tensor is produced by.
output_slot: (`int`) output slot index of tensor.
debug_op: (`str`) name of the debug op.
+      device_name: (`str`) name of the device. If there is only one device or
+        if the specified watch key exists on only one device, this argument
+        is optional.
Returns:
(`list` of `int`): list of dump file sizes in bytes.
@@ -1499,12 +1746,14 @@ class DebugDumpDir(object):
exist in the debug dump data.
"""
+ device_name = self._infer_device_name(device_name, node_name)
watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
- if watch_key not in self._watch_key_to_datum:
+ if watch_key not in self._watch_key_to_datum[device_name]:
raise WatchKeyDoesNotExistInDebugDumpDirError(
- "Watch key \"%s\" does not exist in the debug dump" % watch_key)
+ "Watch key \"%s\" does not exist in the debug dump of device %s" %
+ (watch_key, device_name))
- return self._watch_key_to_dump_size_bytes[watch_key]
+ return self._watch_key_to_dump_size_bytes[device_name][watch_key]
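The accessors above all lean on `self._infer_device_name`, which this hunk references but does not show. A hypothetical sketch of its contract, inferred from the `watch_key_to_data` logic earlier in this diff; the real helper lives elsewhere in debug_data.py:

def _infer_device_name_sketch(device_to_nodes, device_name, node_name):
  # Hypothetical stand-in for DebugDumpDir._infer_device_name.
  if device_name is not None:
    return device_name
  matches = [name for name in device_to_nodes
             if node_name in device_to_nodes[name]]
  if len(matches) == 1:
    return matches[0]
  raise ValueError(
      "Cannot infer a unique device for node %r (%d candidates)." %
      (node_name, len(matches)))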
def node_traceback(self, element_name):
"""Try to retrieve the Python traceback of node's construction.
diff --git a/tensorflow/python/debug/lib/debug_data_test.py b/tensorflow/python/debug/lib/debug_data_test.py
index dc45e8df6c..dd621a5afc 100644
--- a/tensorflow/python/debug/lib/debug_data_test.py
+++ b/tensorflow/python/debug/lib/debug_data_test.py
@@ -23,12 +23,29 @@ import tempfile
import numpy as np
+from tensorflow.core.framework import graph_pb2
from tensorflow.core.framework import tensor_pb2
from tensorflow.python.debug.lib import debug_data
from tensorflow.python.framework import test_util
from tensorflow.python.platform import googletest
+class DeviceNamePathConversionTest(test_util.TensorFlowTestCase):
+
+ def testDeviceNameToDevicePath(self):
+ self.assertEqual(
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_ps,replica_1,task_2,cpu_0",
+ debug_data.device_name_to_device_path("/job:ps/replica:1/task:2/cpu:0"))
+
+ def testDevicePathToDeviceName(self):
+ self.assertEqual(
+ "/job:ps/replica:1/task:2/cpu:0",
+ debug_data.device_path_to_device_name(
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_ps,replica_1,task_2,cpu_0"))
+
+
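The conversion exercised by these two tests is a pair of plain string substitutions. A self-contained sketch with the constants inlined, assuming device names contain no literal `_` or `,` (which would make the reverse mapping lossy):

METADATA_FILE_PREFIX = "_tfdbg_"  # mirrors debug_data.METADATA_FILE_PREFIX
DEVICE_TAG = "device_"            # mirrors debug_data.DEVICE_TAG

def device_name_to_device_path(device_name):
  # ":" -> "_" and "/" -> "," turn a device name into a legal dir name.
  return (METADATA_FILE_PREFIX + DEVICE_TAG +
          device_name.replace(":", "_").replace("/", ","))

def device_path_to_device_name(device_path):
  name = device_path[len(METADATA_FILE_PREFIX + DEVICE_TAG):]
  return name.replace(",", "/").replace("_", ":")

assert device_name_to_device_path("/job:ps/replica:1/task:2/cpu:0") == (
    "_tfdbg_device_,job_ps,replica_1,task_2,cpu_0")
assert device_path_to_device_name(
    "_tfdbg_device_,job_ps,replica_1,task_2,cpu_0") == (
        "/job:ps/replica:1/task:2/cpu:0")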
class ParseNodeOrTensorNameTest(test_util.TensorFlowTestCase):
def testParseNodeName(self):
@@ -163,7 +180,10 @@ class DebugTensorDatumTest(test_util.TensorFlowTestCase):
def testDebugDatum(self):
dump_root = "/tmp/tfdbg_1"
- debug_dump_rel_path = "ns1/ns2/node_a_1_2_DebugIdentity_1472563253536385"
+ debug_dump_rel_path = (
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,cpu_0" +
+ "/ns1/ns2/node_a_1_2_DebugIdentity_1472563253536385")
datum = debug_data.DebugTensorDatum(dump_root, debug_dump_rel_path)
@@ -175,16 +195,18 @@ class DebugTensorDatumTest(test_util.TensorFlowTestCase):
self.assertEqual("ns1/ns2/node_a_1:2:DebugIdentity", datum.watch_key)
self.assertEqual(
os.path.join(dump_root, debug_dump_rel_path), datum.file_path)
- self.assertEqual("{DebugTensorDatum: %s:%d @ %s @ %d}" % (datum.node_name,
- datum.output_slot,
- datum.debug_op,
- datum.timestamp),
- str(datum))
- self.assertEqual("{DebugTensorDatum: %s:%d @ %s @ %d}" % (datum.node_name,
- datum.output_slot,
- datum.debug_op,
- datum.timestamp),
- repr(datum))
+ self.assertEqual(
+ "{DebugTensorDatum (/job:localhost/replica:0/task:0/cpu:0) "
+ "%s:%d @ %s @ %d}" % (datum.node_name,
+ datum.output_slot,
+ datum.debug_op,
+ datum.timestamp), str(datum))
+ self.assertEqual(
+ "{DebugTensorDatum (/job:localhost/replica:0/task:0/cpu:0) "
+ "%s:%d @ %s @ %d}" % (datum.node_name,
+ datum.output_slot,
+ datum.debug_op,
+ datum.timestamp), repr(datum))
def testDumpSizeBytesIsNoneForNonexistentFilePath(self):
dump_root = "/tmp/tfdbg_1"
@@ -204,18 +226,112 @@ class DebugDumpDirTest(test_util.TensorFlowTestCase):
# Tear down temporary dump directory.
shutil.rmtree(self._dump_root)
+ def _makeDataDirWithMultipleDevicesAndDuplicateNodeNames(self):
+ cpu_0_dir = os.path.join(
+ self._dump_root,
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,cpu_0")
+ gpu_0_dir = os.path.join(
+ self._dump_root,
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,gpu_0")
+ gpu_1_dir = os.path.join(
+ self._dump_root,
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,gpu_1")
+ os.makedirs(cpu_0_dir)
+ os.makedirs(gpu_0_dir)
+ os.makedirs(gpu_1_dir)
+ open(os.path.join(
+ cpu_0_dir, "node_foo_1_2_DebugIdentity_1472563253536386"), "wb")
+ open(os.path.join(
+ gpu_0_dir, "node_foo_1_2_DebugIdentity_1472563253536385"), "wb")
+ open(os.path.join(
+ gpu_1_dir, "node_foo_1_2_DebugIdentity_1472563253536387"), "wb")
+
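The helper above leaves the dump root laid out as follows (directory names and timestamps taken from the helper itself):

# <dump_root>/
#   _tfdbg_device_,job_localhost,replica_0,task_0,cpu_0/
#     node_foo_1_2_DebugIdentity_1472563253536386
#   _tfdbg_device_,job_localhost,replica_0,task_0,gpu_0/
#     node_foo_1_2_DebugIdentity_1472563253536385
#   _tfdbg_device_,job_localhost,replica_0,task_0,gpu_1/
#     node_foo_1_2_DebugIdentity_1472563253536387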
def testDebugDumpDir_nonexistentDumpRoot(self):
with self.assertRaisesRegexp(IOError, "does not exist"):
debug_data.DebugDumpDir(tempfile.mktemp() + "_foo")
def testDebugDumpDir_invalidFileNamingPattern(self):
# File name with too few underscores should lead to an exception.
- open(os.path.join(self._dump_root, "node1_DebugIdentity_1234"), "wb")
+ device_dir = os.path.join(
+ self._dump_root,
+ debug_data.METADATA_FILE_PREFIX + debug_data.DEVICE_TAG +
+ ",job_localhost,replica_0,task_0,cpu_0")
+ os.makedirs(device_dir)
+ open(os.path.join(device_dir, "node1_DebugIdentity_1234"), "wb")
with self.assertRaisesRegexp(ValueError,
"does not conform to the naming pattern"):
debug_data.DebugDumpDir(self._dump_root)
+ def testDebugDumpDir_validDuplicateNodeNamesWithMultipleDevices(self):
+ self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames()
+
+ graph_cpu_0 = graph_pb2.GraphDef()
+ node = graph_cpu_0.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/cpu:0"
+ graph_gpu_0 = graph_pb2.GraphDef()
+ node = graph_gpu_0.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:0"
+ graph_gpu_1 = graph_pb2.GraphDef()
+ node = graph_gpu_1.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:1"
+
+ dump_dir = debug_data.DebugDumpDir(
+ self._dump_root,
+ partition_graphs=[graph_cpu_0, graph_gpu_0, graph_gpu_1])
+
+ self.assertItemsEqual(
+ ["/job:localhost/replica:0/task:0/cpu:0",
+ "/job:localhost/replica:0/task:0/gpu:0",
+ "/job:localhost/replica:0/task:0/gpu:1"], dump_dir.devices())
+ self.assertEqual(1472563253536385, dump_dir.t0)
+ self.assertEqual(3, dump_dir.size)
+
+ with self.assertRaisesRegexp(
+ ValueError,
+ r"There are multiple \(3\) devices, but device_name is not specified"):
+ dump_dir.nodes()
+ self.assertItemsEqual(
+ ["node_foo_1"],
+ dump_dir.nodes(device_name="/job:localhost/replica:0/task:0/cpu:0"))
+
+ def testDuplicateNodeNamesInGraphDefOfSingleDeviceRaisesException(self):
+ self._makeDataDirWithMultipleDevicesAndDuplicateNodeNames()
+ graph_cpu_0 = graph_pb2.GraphDef()
+ node = graph_cpu_0.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/cpu:0"
+ graph_gpu_0 = graph_pb2.GraphDef()
+ node = graph_gpu_0.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:0"
+ graph_gpu_1 = graph_pb2.GraphDef()
+ node = graph_gpu_1.node.add()
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:1"
+ node = graph_gpu_1.node.add() # Here is the duplicate.
+ node.name = "node_foo_1"
+ node.op = "FooOp"
+ node.device = "/job:localhost/replica:0/task:0/gpu:1"
+
+ with self.assertRaisesRegexp(
+ ValueError, r"Duplicate node name on device "):
+ debug_data.DebugDumpDir(
+ self._dump_root,
+ partition_graphs=[graph_cpu_0, graph_gpu_0, graph_gpu_1])
+
def testDebugDumpDir_emptyDumpDir(self):
dump_dir = debug_data.DebugDumpDir(self._dump_root)
diff --git a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
new file mode 100644
index 0000000000..b0dc25851c
--- /dev/null
+++ b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
@@ -0,0 +1,93 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for debugger functionalities under multiple (i.e., >1) GPUs."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import device_lib
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import debug_data
+from tensorflow.python.debug.lib import debug_utils
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class SessionDebugMultiGPUTest(test_util.TensorFlowTestCase):
+
+ def setUp(self):
+ self._dump_root = tempfile.mkdtemp()
+
+ def tearDown(self):
+ ops.reset_default_graph()
+
+ # Tear down temporary dump directory.
+ if os.path.isdir(self._dump_root):
+ shutil.rmtree(self._dump_root)
+
+ def testMultiGPUSessionRun(self):
+ local_devices = device_lib.list_local_devices()
+ gpu_device_names = []
+ for device in local_devices:
+ if device.device_type == "GPU":
+ gpu_device_names.append(device.name)
+ gpu_device_names = sorted(gpu_device_names)
+
+ if len(gpu_device_names) < 2:
+ self.skipTest(
+ "This test requires at least 2 GPUs, but only %d is available." %
+ len(gpu_device_names))
+
+ with session.Session() as sess:
+ v = variables.Variable([10.0, 15.0], dtype=dtypes.float32, name="v")
+ with ops.device(gpu_device_names[0]):
+ u0 = math_ops.add(v, v, name="u0")
+ with ops.device(gpu_device_names[1]):
+ u1 = math_ops.multiply(v, v, name="u1")
+ w = math_ops.subtract(u1, u0, name="w")
+
+ sess.run(v.initializer)
+
+ run_options = config_pb2.RunOptions(output_partition_graphs=True)
+ debug_utils.watch_graph(run_options, sess.graph,
+ debug_urls="file://" + self._dump_root)
+ run_metadata = config_pb2.RunMetadata()
+ self.assertAllClose(
+ [80.0, 195.0],
+ sess.run(w, options=run_options, run_metadata=run_metadata))
+
+ debug_dump_dir = debug_data.DebugDumpDir(
+ self._dump_root, partition_graphs=run_metadata.partition_graphs)
+ self.assertEqual(3, len(debug_dump_dir.devices()))
+ self.assertAllClose(
+ [10.0, 15.0], debug_dump_dir.get_tensors("v", 0, "DebugIdentity")[0])
+ self.assertAllClose(
+ [20.0, 30.0], debug_dump_dir.get_tensors("u0", 0, "DebugIdentity")[0])
+ self.assertAllClose(
+ [100.0, 225.0],
+ debug_dump_dir.get_tensors("u1", 0, "DebugIdentity")[0])
+
+
+if __name__ == "__main__":
+ googletest.main()
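The expected values asserted in testMultiGPUSessionRun follow directly from the graph it builds:

# v  = [10.0, 15.0]
# u0 = v + v   = [20.0, 30.0]    (placed on gpu_device_names[0])
# u1 = v * v   = [100.0, 225.0]  (placed on gpu_device_names[1])
# w  = u1 - u0 = [80.0, 195.0]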
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index ef1d1bb2f3..d7da1952a0 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -249,8 +249,12 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertIn(results.v.op.type, results.dump.node_op_type(results.v_name))
self.assertIn(results.w.op.type, results.dump.node_op_type(results.w_name))
- with self.assertRaisesRegexp(
- ValueError, "Node 'foo_bar' does not exist in partition graphs."):
+ if test_util.gpu_device_name():
+ expected_error_regexp = r"None of the .* devices has a node named "
+ else:
+ expected_error_regexp = (
+ r"Node \'foo_bar\' does not exist in the partition graph of device")
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
results.dump.node_op_type("foo_bar")
def testDumpStringTensorsWorks(self):
@@ -436,9 +440,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
# Verify dump files
self.assertTrue(os.path.isdir(self._dump_root))
- self.assertTrue(os.path.isdir(os.path.join(self._dump_root, u_namespace)))
- self.assertTrue(
- os.path.isdir(os.path.join(self._dump_root, v_namespace, "v")))
+ u_glob_out = glob.glob(os.path.join(self._dump_root, "*", u_namespace))
+ v_glob_out = glob.glob(os.path.join(
+ self._dump_root, "*", v_namespace, "v"))
+ self.assertTrue(os.path.isdir(u_glob_out[0]))
+ self.assertTrue(os.path.isdir(v_glob_out[0]))
dump = debug_data.DebugDumpDir(
self._dump_root, partition_graphs=run_metadata.partition_graphs)
@@ -688,7 +694,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
u_read_name = u_name + "/read"
# Test node name list lookup of the DebugDumpDir object.
- node_names = dump.nodes()
+ if test_util.gpu_device_name():
+ node_names = dump.nodes(
+ device_name="/job:localhost/replica:0/task:0/gpu:0")
+ else:
+ node_names = dump.nodes()
self.assertTrue(u_name in node_names)
self.assertTrue(u_read_name in node_names)
@@ -698,7 +708,11 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertEqual(1, len(u_attr["shape"].shape.dim))
self.assertEqual(2, u_attr["shape"].shape.dim[0].size)
- with self.assertRaisesRegexp(ValueError, "No node named \"foo\" exists"):
+ if test_util.gpu_device_name():
+ expected_error_regexp = r"None of the .* devices has a node named "
+ else:
+ expected_error_regexp = r"No node named \"foo\" exists"
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
dump.node_attributes("foo")
def testGraphStructureLookupGivesDebugWatchKeys(self):
@@ -721,7 +735,6 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertEqual(0, u_data[0].output_slot)
self.assertEqual("DebugIdentity", u_data[0].debug_op)
self.assertGreaterEqual(u_data[0].timestamp, 0)
-
self.assertEqual([], dump.watch_key_to_data("foo"))
def testGraphStructureLookupGivesNodeInputsAndRecipients(self):
@@ -752,12 +765,13 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertEqual([], dump.node_recipients(w_name, is_control=True))
# Test errors raised on invalid node names.
- with self.assertRaisesRegexp(ValueError,
- "does not exist in partition graphs"):
+ if test_util.gpu_device_name():
+ expected_error_regexp = r"None of the .* devices has a node named "
+ else:
+ expected_error_regexp = "does not exist in the partition graph of device "
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
dump.node_inputs(u_name + "foo")
-
- with self.assertRaisesRegexp(ValueError,
- "does not exist in partition graphs"):
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
dump.node_recipients(u_name + "foo")
# Test transitive_inputs().
@@ -768,8 +782,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
self.assertEqual(
set([u_name, u_read_name, v_name]), set(dump.transitive_inputs(w_name)))
- with self.assertRaisesRegexp(ValueError,
- "does not exist in partition graphs"):
+ with self.assertRaisesRegexp(ValueError, expected_error_regexp):
dump.transitive_inputs(u_name + "foo")
def testGraphStructureLookupWithoutPartitionGraphsDoesNotErrorOut(self):
@@ -1066,10 +1079,12 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
y = array_ops.squeeze(ph, name="mismatch/y")
run_options = config_pb2.RunOptions(output_partition_graphs=True)
+ run_metadata = config_pb2.RunMetadata()
debug_utils.watch_graph(
run_options, sess.graph, debug_urls=self._debug_urls(), global_step=1)
- sess.run(x, feed_dict={ph: np.array([[7.0, 8.0]])}, options=run_options)
+ sess.run(x, feed_dict={ph: np.array([[7.0, 8.0]])}, options=run_options,
+ run_metadata=run_metadata)
dump1 = debug_data.DebugDumpDir(self._dump_root)
self.assertEqual(1, dump1.core_metadata.global_step)
self.assertGreaterEqual(dump1.core_metadata.session_run_index, 0)