about summary refs log tree commit diff homepage
path: root/tensorflow/core/debug
diff options
context:
space:
mode:
author Shanqing Cai <cais@google.com> 2017-06-05 10:29:50 -0700
committer TensorFlower Gardener <gardener@tensorflow.org> 2017-06-05 10:33:23 -0700
commit cc2dd4ac8538045e94e3f8fe4fb1c532f67c1844 (patch)
tree 80db01fd15e27ebc0d7b88959caf3140c644bd39 /tensorflow/core/debug
parent a5909d64320a9dfd940b298bcb0bd758e514a04f (diff)
tfdbg: dump debug data from different devices in separate directories
Fixes #7051, in which TFDBG failed to load the data dump from a Session.run() call involving multiple GPUs. The root cause of the bug was that TFDBG previously assumed that node names are unique across all partition graphs. This is, however, not the case when multiple GPUs exist: the Send/Recv nodes in the partition graphs of the GPUs can have duplicate names. Similar cases may arise in the future for other reasons (e.g., distributed sessions and/or graph optimization). This CL relaxes the assumption by dumping the GraphDef and tensor data from each device into its own sub-directory under the dump root directory; the sub-directory name is derived from the device name by DebugNodeKey::DeviceNameToDevicePath (e.g., "/job:worker/replica:1/task:0/gpu:2" becomes "_tfdbg_device_,job_worker,replica_1,task_0,gpu_2"). PiperOrigin-RevId: 158029814
Diffstat (limited to 'tensorflow/core/debug')
-rw-r--r-- tensorflow/core/debug/debug_io_utils.cc 38
-rw-r--r-- tensorflow/core/debug/debug_io_utils.h 8
-rw-r--r-- tensorflow/core/debug/debug_io_utils_test.cc 28
-rw-r--r-- tensorflow/core/debug/grpc_session_debug_test.cc 10
4 files changed, 69 insertions, 15 deletions
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 25847a20a4..54366ce249 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -119,6 +119,18 @@ Status PublishEncodedGraphDefInChunks(const string& encoded_graph_def,
} // namespace
+// static
+const char* const DebugIO::kMetadataFilePrefix = "_tfdbg_";
+
+// static
+const char* const DebugIO::kCoreMetadataTag = "core_metadata_";
+
+// static
+const char* const DebugIO::kDeviceTag = "device_";
+
+// static
+const char* const DebugIO::kGraphTag = "graph_";
+
DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
const int32 output_slot, const string& debug_op)
: device_name(device_name),
@@ -126,7 +138,8 @@ DebugNodeKey::DebugNodeKey(const string& device_name, const string& node_name,
output_slot(output_slot),
debug_op(debug_op),
debug_node_name(
- strings::StrCat(node_name, ":", output_slot, ":", debug_op)) {}
+ strings::StrCat(node_name, ":", output_slot, ":", debug_op)),
+ device_path(DeviceNameToDevicePath(device_name)) {}
Status ReadEventFromFile(const string& dump_file_path, Event* event) {
Env* env(Env::Default());
@@ -158,6 +171,15 @@ Status ReadEventFromFile(const string& dump_file_path, Event* event) {
}
// static
+const string DebugNodeKey::DeviceNameToDevicePath(const string& device_name) {
+ return strings::StrCat(
+ DebugIO::kMetadataFilePrefix, DebugIO::kDeviceTag,
+ str_util::StringReplace(
+ str_util::StringReplace(device_name, ":", "_", true), "/", ",",
+ true));
+}
+
+// static
const char* const DebugIO::kFileURLScheme = "file://";
// static
const char* const DebugIO::kGrpcURLScheme = "grpc://";
@@ -236,7 +258,8 @@ Status DebugIO::PublishDebugMetadata(
const string core_metadata_path = AppendTimestampToFilePath(
io::JoinPath(
dump_root_dir,
- strings::StrCat("_tfdbg_core_metadata_", "sessionrun",
+ strings::StrCat(DebugIO::kMetadataFilePrefix,
+ DebugIO::kCoreMetadataTag, "sessionrun",
strings::Printf("%.14lld", session_run_index))),
Env::Default()->NowMicros());
status.Update(DebugFileIO::DumpEventProtoToFile(
@@ -325,10 +348,11 @@ Status DebugIO::PublishGraph(const Graph& graph, const string& device_name,
Status status = Status::OK();
for (const string& debug_url : debug_urls) {
if (debug_url.find(kFileURLScheme) == 0) {
- const string dump_root_dir = debug_url.substr(strlen(kFileURLScheme));
- // TODO(cais): (b/38325442) Serialize the GraphDef to a directory that
- // reflects the device name.
- const string file_name = strings::StrCat("_tfdbg_graph_", now_micros);
+ const string dump_root_dir =
+ io::JoinPath(debug_url.substr(strlen(kFileURLScheme)),
+ DebugNodeKey::DeviceNameToDevicePath(device_name));
+ const string file_name = strings::StrCat(DebugIO::kMetadataFilePrefix,
+ DebugIO::kGraphTag, now_micros);
status.Update(
DebugFileIO::DumpEventProtoToFile(event, dump_root_dir, file_name));
@@ -437,7 +461,7 @@ string DebugFileIO::GetDumpFilePath(const string& dump_root_dir,
const DebugNodeKey& debug_node_key,
const uint64 wall_time_us) {
return AppendTimestampToFilePath(
- io::JoinPath(dump_root_dir,
+ io::JoinPath(dump_root_dir, debug_node_key.device_path,
strings::StrCat(debug_node_key.node_name, "_",
debug_node_key.output_slot, "_",
debug_node_key.debug_op)),
diff --git a/tensorflow/core/debug/debug_io_utils.h b/tensorflow/core/debug/debug_io_utils.h
index f3e76cc0ee..69d8c7bd4e 100644
--- a/tensorflow/core/debug/debug_io_utils.h
+++ b/tensorflow/core/debug/debug_io_utils.h
@@ -44,11 +44,14 @@ struct DebugNodeKey {
DebugNodeKey(const string& device_name, const string& node_name,
const int32 output_slot, const string& debug_op);
+ static const string DeviceNameToDevicePath(const string& device_name);
+
const string device_name;
const string node_name;
const int32 output_slot;
const string debug_op;
const string debug_node_name;
+ const string device_path;
};
class DebugIO {
@@ -136,6 +139,11 @@ class DebugIO {
static Status CloseDebugURL(const string& debug_url);
+ static const char* const kMetadataFilePrefix;
+ static const char* const kCoreMetadataTag;
+ static const char* const kDeviceTag;
+ static const char* const kGraphTag;
+
static const char* const kFileURLScheme;
static const char* const kGrpcURLScheme;
};
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 406bcae07f..77039aa4ab 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/util/event.pb.h"
@@ -47,6 +48,18 @@ class DebugIOUtilsTest : public ::testing::Test {
std::unique_ptr<Tensor> tensor_b_;
};
+TEST_F(DebugIOUtilsTest, ConstructDebugNodeKey) {
+ DebugNodeKey debug_node_key("/job:worker/replica:1/task:0/gpu:2",
+ "hidden_1/MatMul", 0, "DebugIdentity");
+ EXPECT_EQ("/job:worker/replica:1/task:0/gpu:2", debug_node_key.device_name);
+ EXPECT_EQ("hidden_1/MatMul", debug_node_key.node_name);
+ EXPECT_EQ(0, debug_node_key.output_slot);
+ EXPECT_EQ("DebugIdentity", debug_node_key.debug_op);
+ EXPECT_EQ("hidden_1/MatMul:0:DebugIdentity", debug_node_key.debug_node_name);
+ EXPECT_EQ("_tfdbg_device_,job_worker,replica_1,task_0,gpu_2",
+ debug_node_key.device_path);
+}
+
TEST_F(DebugIOUtilsTest, DumpFloatTensorToFileSunnyDay) {
Initialize();
@@ -138,10 +151,14 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) {
// First, create the file at the path.
const string test_dir = testing::TmpDir();
- const string txt_file_name = strings::StrCat(test_dir, "/baz");
-
- if (!env_->FileExists(test_dir).ok()) {
- ASSERT_TRUE(env_->CreateDir(test_dir).ok());
+ const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+ const DebugNodeKey kDebugNodeKey(kDeviceName, "baz/tensor_a", 0,
+ "DebugIdentity");
+ const string txt_file_dir =
+ io::JoinPath(test_dir, DebugNodeKey::DeviceNameToDevicePath(kDeviceName));
+ const string txt_file_name = io::JoinPath(txt_file_dir, "baz");
+ if (!env_->FileExists(txt_file_dir).ok()) {
+ ASSERT_TRUE(env_->RecursivelyCreateDir(txt_file_dir).ok());
}
ASSERT_EQ(error::Code::NOT_FOUND, env_->FileExists(txt_file_name).code());
@@ -157,8 +174,7 @@ TEST_F(DebugIOUtilsTest, DumpTensorToFileCannotCreateDirectory) {
// Second, try to dump the tensor to a path that requires "baz" to be a
// directory, which should lead to an error.
- const DebugNodeKey kDebugNodeKey("/job:localhost/replica:0/task:0/cpu:0",
- "baz/tensor_a", 0, "DebugIdentity");
+
const uint64 wall_time = env_->NowMicros();
string dump_file_name;
diff --git a/tensorflow/core/debug/grpc_session_debug_test.cc b/tensorflow/core/debug/grpc_session_debug_test.cc
index 6c68729410..9584d8b9f3 100644
--- a/tensorflow/core/debug/grpc_session_debug_test.cc
+++ b/tensorflow/core/debug/grpc_session_debug_test.cc
@@ -187,7 +187,10 @@ TEST_F(GrpcSessionDebugTest, FileDebugURL) {
IsSingleFloatValue(outputs[0], 4.0);
std::vector<Tensor> dumped_tensors;
- LoadTensorDumps("n", &dumped_tensors);
+ LoadTensorDumps(io::JoinPath(DebugNodeKey::DeviceNameToDevicePath(
+ cluster->devices()[0].name()),
+ "n"),
+ &dumped_tensors);
if (i == 0 || i == 5) {
ASSERT_EQ(0, dumped_tensors.size());
@@ -267,7 +270,10 @@ TEST_F(GrpcSessionDebugTest, MultiDevices_String) {
TF_CHECK_OK(session->Close());
std::vector<Tensor> dumped_tensors;
- LoadTensorDumps("n", &dumped_tensors);
+ LoadTensorDumps(
+ io::JoinPath(DebugNodeKey::DeviceNameToDevicePath(a_dev.name()),
+ "n"),
+ &dumped_tensors);
ASSERT_EQ(1, dumped_tensors.size());
ASSERT_EQ(TensorShape({2, 2}), dumped_tensors[0].shape());
for (size_t i = 0; i < 4; ++i) {