diff options
author | 2017-11-30 15:20:30 -0800 | |
---|---|---|
committer | 2017-11-30 15:23:54 -0800 | |
commit | ce4200eae990d7f5efdfb727939d38bf48001ba2 (patch) | |
tree | 7c9c3e9cd932273198fa2ef8e4fb24637de1f42f /tensorflow/core/profiler | |
parent | 6bfc73a0b3c6810725a5eb0020470457cc5cc23e (diff) |
Fix profiler to track some missed persistent bytes.
PiperOrigin-RevId: 177516249
Diffstat (limited to 'tensorflow/core/profiler')
-rw-r--r-- | tensorflow/core/profiler/g3doc/options.md | 11 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node.cc | 28 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node.h | 10 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_show_test.cc | 37 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_stats_test.cc | 105 | ||||
-rw-r--r-- | tensorflow/core/profiler/tfprof_log.proto | 5 |
6 files changed, 119 insertions, 77 deletions
diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md index 4c73e372e3..dd12f76d6f 100644 --- a/tensorflow/core/profiler/g3doc/options.md +++ b/tensorflow/core/profiler/g3doc/options.md @@ -60,11 +60,14 @@ Currently, profiler only tracks the allocation of memory. As a result, the accumulated memory request is uaually larger than the peak memory of the overall model. -bytes: The memory allocations requested by the operation. -peak_bytes: The peak requested memory (not de-allocated) by the operation. -residual_bytes: The memory requested by the operation and not de-allocated +It's recommended to generate timeline to see the allocator memory usage over +time. + +`bytes`: The memory allocations requested by the operation. +`peak_bytes`: The peak requested memory (not de-allocated) by the operation. +`residual_bytes`: The memory requested by the operation and not de-allocated when Compute finishes. -output_bytes: The memory output by the operation. It's not necessarily requested +`output_bytes`: The memory output by the operation. It's not necessarily requested by the current operation. For example, it can be a tensor forwarded from input to output, with in-place mutation. diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc index 671b65d708..5cd1050bcc 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.cc +++ b/tensorflow/core/profiler/internal/tfprof_node.cc @@ -139,6 +139,25 @@ void ExecStep::AddMemoryStats(const string& dev, exec_.accelerator_persistent_bytes() + step_stat.memory_stats().device_persistent_memory_size()); } + + // TODO(xpan): Make this more accurate: + // High level: Memory tracking is suspicous and requires large scale + // clean up. + // Investigte the memory usage difference between CPU/GPU with OpViewTest. + // + // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only + // discuss OpKernelContext-related allocations. + // 2. allocate_output calls allocate_tensor, which is properly tracked in + // 'NodeExecStats.memory'. + // 3. allocate_temp is only tracked through record_xxx_temp. It appears + // in 'NodeExecStats.memory_stats'. + // 4. allocate_persistent calls allocate_tensor, which is properly tracked + // in 'NodeExecStats.memory'. However, there is no way to count it as + // persistent now. + // 5. record_xxx_persistent is called when allocate_persistent + // is not used and hence tracks some complementary bytes. It appears in + // 'NodeExecStats.memory_stats'. It's suspicious. But we should + // use it now since it covers constant op. int64 residual_bytes = 0; int64 requested_bytes = 0; int64 peak_bytes = 0; @@ -147,6 +166,15 @@ void ExecStep::AddMemoryStats(const string& dev, requested_bytes += mem.total_bytes(); peak_bytes += mem.peak_bytes(); } + residual_bytes += + exec_.host_persistent_bytes() + exec_.accelerator_persistent_bytes(); + requested_bytes += exec_.host_persistent_bytes() + + exec_.accelerator_persistent_bytes() + + exec_.host_temp_bytes() + exec_.accelerator_temp_bytes(); + peak_bytes += exec_.host_persistent_bytes() + + exec_.accelerator_persistent_bytes() + exec_.host_temp_bytes() + + exec_.accelerator_temp_bytes(); + exec_.set_requested_bytes(requested_bytes); exec_.set_residual_bytes(residual_bytes); exec_.set_peak_bytes(peak_bytes); diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h index e2d0563a07..77c14cb792 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.h +++ b/tensorflow/core/profiler/internal/tfprof_node.h @@ -593,17 +593,11 @@ class TFGraphNode { int64 accelerator_persistent_bytes() const { int64 persistent_bytes = 0; for (const auto& exec : execs_) { - persistent_bytes += exec.second.accelerator_persistent_bytes(); + persistent_bytes = std::max(persistent_bytes, + exec.second.accelerator_persistent_bytes()); } return persistent_bytes; } - int64 host_persistent_bytes(int64 step) const { - auto exec = execs_.find(step); - if (exec == execs_.end()) { - return 0; - } - return exec->second.host_persistent_bytes(); - } const std::map<int32, std::pair<int64, uint64>>& output_memory( int64 step) const { auto exec = execs_.find(step); diff --git a/tensorflow/core/profiler/internal/tfprof_show_test.cc b/tensorflow/core/profiler/internal/tfprof_show_test.cc index 1f19f8c322..98773ae19e 100644 --- a/tensorflow/core/profiler/internal/tfprof_show_test.cc +++ b/tensorflow/core/profiler/internal/tfprof_show_test.cc @@ -105,12 +105,13 @@ TEST_F(TFProfShowTest, DumpScopeMode) { "node name | # parameters | # float_ops | requested bytes | peak bytes | " "residual bytes | output bytes | total execution time | accelerator " "execution time | cpu execution time\n_TFProfRoot (--/451 params, --/0 " - "flops, --/0B, --/0B, --/0B, --/2.56KB, --/13us, --/0us, --/13us)\n DW " - "(3x3x3x6, 162/162 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, " - "1.28KB/1.28KB, 2us/2us, 0us/0us, 2us/2us)\n DW2 (2x2x6x12, 288/288 " - "params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, 1.28KB/1.28KB, 11us/11us, " - "0us/0us, 11us/11us)\n ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, " - "0B/0B, 0B/0B, 0us/0us, 0us/0us, 0us/0us)\n", + "flops, --/2.56KB, --/2.56KB, --/2.56KB, --/2.56KB, --/13us, --/0us, " + "--/13us)\n DW (3x3x3x6, 162/162 params, 0/0 flops, 1.28KB/1.28KB, " + "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 2us/2us, 0us/0us, " + "2us/2us)\n DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.28KB/1.28KB, " + "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 11us/11us, 0us/0us, " + "11us/11us)\n ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, " + "0B/0B, 0us/0us, 0us/0us, 0us/0us)\n", dump_str); EXPECT_EQ(dump_str, TestToFromProto("scope", opts)); @@ -178,22 +179,22 @@ TEST_F(TFProfShowTest, DumpOpMode) { EXPECT_EQ( "nodename|requestedbytes|totalexecutiontime|acceleratorexecutiontime|" "cpuexecutiontime|#parameters|#float_ops|opoccurrence(run|defined)|" - "inputshapes\nVariableV20B(0.00%,0.00%),13us(100.00%,0.26%),0us(100.00%," - "0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_ops(100.00%" - ",0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:13us\n\nAdd0B(" - "0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99.71%,0.00%)," - "0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_type:0:1," - "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12,\t1:1\t(" - "run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t(run*0|" - "defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0.00%)," - "0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_ops(" - "100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_" + "inputshapes\nVariableV22.56KB(100.00%,8.40%),13us(100.00%,0.26%),0us(" + "100.00%,0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_" + "ops(100.00%,0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:" + "13us\n\nAdd0B(0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99." + "71%,0.00%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_" + "type:0:1,\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12," + "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t(" + "run*0|defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0." + "00%),0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_" + "ops(100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_" "time:0us\ninput_type:0:2x2x6x12,\t1:2x2x6x12\t(run*0|defined*1)\texec_" "time:0us\ninput_type:0:3x3x3x6,\t1:3x3x3x6\t(run*0|defined*1)\texec_" "time:0us\n\nConst0B(0.00%,0.00%),2us(99.74%,0.04%),0us(100.00%,0.00%)," "2us(99.71%,0.04%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),1|" - "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D14.59KB(" - "100.00%,100.00%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99." + "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D27.90KB(" + "91.60%,91.60%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99." "67%,98.77%),0params(0.00%,0.00%),10.44kfloat_ops(100.00%,100.00%),2|" "2\n\ninput_type:0:2x3x3x6,\t1:2x2x6x12\t(run*1|defined*1)\texec_time:" "597us\ninput_type:0:2x6x6x3,\t1:3x3x3x6\t(run*1|defined*1)\texec_time:4." diff --git a/tensorflow/core/profiler/internal/tfprof_stats_test.cc b/tensorflow/core/profiler/internal/tfprof_stats_test.cc index 2f2101d76b..b86a83cb1b 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats_test.cc +++ b/tensorflow/core/profiler/internal/tfprof_stats_test.cc @@ -89,21 +89,27 @@ TEST_F(TFProfStatsTest, CustomOpType) { GraphNodeProto expected; CHECK(protobuf::TextFormat::ParseFromString( - "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: " - "451\nchildren {\n name: \"DW\"\n exec_micros: 2\n parameters: 162\n " - "total_exec_micros: 2\n total_parameters: 162\n devices: " + "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: " + "2560\ntotal_parameters: 451\nchildren {\n name: \"DW\"\n exec_micros: " + "2\n requested_bytes: 1280\n parameters: 162\n total_exec_micros: 2\n " + " total_requested_bytes: 1280\n total_parameters: 162\n devices: " "\"/job:localhost/replica:0/task:0/gpu:0\"\n cpu_exec_micros: 2\n " "total_cpu_exec_micros: 2\n run_count: 1\n total_run_count: 1\n " - "total_definition_count: 1\n output_bytes: 1280\n total_output_bytes: " - "1280\n}\nchildren {\n name: \"DW2\"\n exec_micros: 11\n parameters: " - "288\n total_exec_micros: 11\n total_parameters: 288\n devices: " + "total_definition_count: 1\n peak_bytes: 1280\n residual_bytes: 1280\n " + " output_bytes: 1280\n total_peak_bytes: 1280\n total_residual_bytes: " + "1280\n total_output_bytes: 1280\n}\nchildren {\n name: \"DW2\"\n " + "exec_micros: 11\n requested_bytes: 1280\n parameters: 288\n " + "total_exec_micros: 11\n total_requested_bytes: 1280\n " + "total_parameters: 288\n devices: " "\"/job:localhost/replica:0/task:0/gpu:0\"\n cpu_exec_micros: 11\n " "total_cpu_exec_micros: 11\n run_count: 1\n total_run_count: 1\n " - "total_definition_count: 1\n output_bytes: 1280\n total_output_bytes: " - "1280\n}\nchildren {\n name: \"ScalarW\"\n parameters: 1\n " - "total_parameters: 1\n total_definition_count: " + "total_definition_count: 1\n peak_bytes: 1280\n residual_bytes: 1280\n " + " output_bytes: 1280\n total_peak_bytes: 1280\n total_residual_bytes: " + "1280\n total_output_bytes: 1280\n}\nchildren {\n name: \"ScalarW\"\n " + "parameters: 1\n total_parameters: 1\n total_definition_count: " "1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: " - "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n", + "2\ntotal_definition_count: 3\ntotal_peak_bytes: " + "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); @@ -119,21 +125,27 @@ TEST_F(TFProfStatsTest, CheckPointOpType) { GraphNodeProto expected; CHECK(protobuf::TextFormat::ParseFromString( - "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: " - "451\nchildren {\n name: \"DW\"\n exec_micros: 2\n parameters: 162\n " - "total_exec_micros: 2\n total_parameters: 162\n devices: " + "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: " + "2560\ntotal_parameters: 451\nchildren {\n name: \"DW\"\n exec_micros: " + "2\n requested_bytes: 1280\n parameters: 162\n total_exec_micros: 2\n " + " total_requested_bytes: 1280\n total_parameters: 162\n devices: " "\"/job:localhost/replica:0/task:0/gpu:0\"\n cpu_exec_micros: 2\n " "total_cpu_exec_micros: 2\n run_count: 1\n total_run_count: 1\n " - "total_definition_count: 1\n output_bytes: 1280\n total_output_bytes: " - "1280\n}\nchildren {\n name: \"DW2\"\n exec_micros: 11\n parameters: " - "288\n total_exec_micros: 11\n total_parameters: 288\n devices: " + "total_definition_count: 1\n peak_bytes: 1280\n residual_bytes: 1280\n " + " output_bytes: 1280\n total_peak_bytes: 1280\n total_residual_bytes: " + "1280\n total_output_bytes: 1280\n}\nchildren {\n name: \"DW2\"\n " + "exec_micros: 11\n requested_bytes: 1280\n parameters: 288\n " + "total_exec_micros: 11\n total_requested_bytes: 1280\n " + "total_parameters: 288\n devices: " "\"/job:localhost/replica:0/task:0/gpu:0\"\n cpu_exec_micros: 11\n " "total_cpu_exec_micros: 11\n run_count: 1\n total_run_count: 1\n " - "total_definition_count: 1\n output_bytes: 1280\n total_output_bytes: " - "1280\n}\nchildren {\n name: \"ScalarW\"\n parameters: 1\n " - "total_parameters: 1\n total_definition_count: " + "total_definition_count: 1\n peak_bytes: 1280\n residual_bytes: 1280\n " + " output_bytes: 1280\n total_peak_bytes: 1280\n total_residual_bytes: " + "1280\n total_output_bytes: 1280\n}\nchildren {\n name: \"ScalarW\"\n " + "parameters: 1\n total_parameters: 1\n total_definition_count: " "1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: " - "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n", + "2\ntotal_definition_count: 3\ntotal_peak_bytes: " + "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); @@ -150,7 +162,7 @@ TEST_F(TFProfStatsTest, TestGraph) { GraphNodeProto expected; CHECK(protobuf::TextFormat::ParseFromString( "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: " - "14592\ntotal_parameters: 451\nchildren {\n name: " + "30464\ntotal_parameters: 451\nchildren {\n name: " "\"DW/Initializer/random_normal/mul\"\n children {\n name: " "\"DW/Initializer/random_normal/RandomStandardNormal\"\n children {\n " " name: \"DW/Initializer/random_normal/shape\"\n " @@ -166,7 +178,7 @@ TEST_F(TFProfStatsTest, TestGraph) { "4\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: " "404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: " "6\ntotal_definition_count: 32\ntotal_peak_bytes: " - "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n", + "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); @@ -181,9 +193,9 @@ TEST_F(TFProfStatsTest, TestFloatOps) { GraphNodeProto expected; CHECK(protobuf::TextFormat::ParseFromString( "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: " - "14592\ntotal_parameters: 451\nchildren {\n name: \"Conv2D\"\n " - "exec_micros: 4292\n requested_bytes: 9472\n total_exec_micros: 4292\n " - " total_requested_bytes: 9472\n devices: " + "30464\ntotal_parameters: 451\nchildren {\n name: \"Conv2D\"\n " + "exec_micros: 4292\n requested_bytes: 18176\n total_exec_micros: " + "4292\n total_requested_bytes: 18176\n devices: " "\"/job:localhost/replica:0/task:0/gpu:0\"\n float_ops: 5832\n " "total_float_ops: 5832\n input_shapes {\n key: 0\n value {\n " "dim {\n size: 2\n }\n dim {\n size: 6\n " @@ -194,11 +206,11 @@ TEST_F(TFProfStatsTest, TestFloatOps) { "6\n }\n }\n }\n accelerator_exec_micros: 226\n " "cpu_exec_micros: 4066\n total_accelerator_exec_micros: 226\n " "total_cpu_exec_micros: 4066\n run_count: 1\n total_run_count: 1\n " - "total_definition_count: 1\n peak_bytes: 5888\n residual_bytes: 768\n " - "output_bytes: 768\n total_peak_bytes: 5888\n total_residual_bytes: " + "total_definition_count: 1\n peak_bytes: 14592\n residual_bytes: 768\n " + " output_bytes: 768\n total_peak_bytes: 14592\n total_residual_bytes: " "768\n total_output_bytes: 768\n}\nchildren {\n name: \"Conv2D_1\"\n " - "exec_micros: 597\n requested_bytes: 5120\n total_exec_micros: 597\n " - "total_requested_bytes: 5120\n devices: " + "exec_micros: 597\n requested_bytes: 9728\n total_exec_micros: 597\n " + "total_requested_bytes: 9728\n devices: " "\"/job:localhost/replica:0/task:0/gpu:0\"\n float_ops: 4608\n " "total_float_ops: 4608\n input_shapes {\n key: 0\n value {\n " "dim {\n size: 2\n }\n dim {\n size: 3\n " @@ -209,12 +221,12 @@ TEST_F(TFProfStatsTest, TestFloatOps) { "12\n }\n }\n }\n accelerator_exec_micros: 178\n " "cpu_exec_micros: 419\n total_accelerator_exec_micros: 178\n " "total_cpu_exec_micros: 419\n run_count: 1\n total_run_count: 1\n " - "total_definition_count: 1\n peak_bytes: 4096\n residual_bytes: 512\n " - "output_bytes: 512\n total_peak_bytes: 4096\n total_residual_bytes: " + "total_definition_count: 1\n peak_bytes: 8704\n residual_bytes: 512\n " + "output_bytes: 512\n total_peak_bytes: 8704\n total_residual_bytes: " "512\n total_output_bytes: 512\n}\ntotal_float_ops: " "10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: " "4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: " - "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n", + "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); @@ -231,9 +243,9 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) { GraphNodeProto expected; CHECK(protobuf::TextFormat::ParseFromString( "name: \"_TFProfRoot\"\ntotal_exec_micros: 597\ntotal_requested_bytes: " - "5120\nchildren {\n name: \"Conv2D_1\"\n exec_micros: 597\n " - "requested_bytes: 5120\n total_exec_micros: 597\n " - "total_requested_bytes: 5120\n devices: " + "9728\nchildren {\n name: \"Conv2D_1\"\n exec_micros: 597\n " + "requested_bytes: 9728\n total_exec_micros: 597\n " + "total_requested_bytes: 9728\n devices: " "\"/job:localhost/replica:0/task:0/gpu:0\"\n float_ops: 4608\n " "total_float_ops: 4608\n input_shapes {\n key: 0\n value {\n " "dim {\n size: 2\n }\n dim {\n size: 3\n " @@ -244,12 +256,12 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) { "12\n }\n }\n }\n accelerator_exec_micros: 178\n " "cpu_exec_micros: 419\n total_accelerator_exec_micros: 178\n " "total_cpu_exec_micros: 419\n run_count: 1\n total_run_count: 1\n " - "total_definition_count: 1\n peak_bytes: 4096\n residual_bytes: 512\n " - "output_bytes: 512\n total_peak_bytes: 4096\n total_residual_bytes: " + "total_definition_count: 1\n peak_bytes: 8704\n residual_bytes: 512\n " + "output_bytes: 512\n total_peak_bytes: 8704\n total_residual_bytes: " "512\n total_output_bytes: 512\n}\ntotal_float_ops: " "4608\ntotal_accelerator_exec_micros: 178\ntotal_cpu_exec_micros: " "419\ntotal_run_count: 1\ntotal_definition_count: 2\ntotal_peak_bytes: " - "4096\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n", + "8704\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); @@ -265,8 +277,9 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) { GraphNodeProto expected; CHECK(protobuf::TextFormat::ParseFromString( "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: " - "14592\ntotal_parameters: 451\nchildren {\n name: \"DW\"\n " - "exec_micros: 2\n parameters: 162\n total_exec_micros: 2\n " + "30464\ntotal_parameters: 451\nchildren {\n name: \"DW\"\n " + "exec_micros: 2\n requested_bytes: 1280\n parameters: 162\n " + "total_exec_micros: 2\n total_requested_bytes: 1280\n " "total_parameters: 162\n devices: " "\"/job:localhost/replica:0/task:0/gpu:0\"\n tensor_value {\n dtype: " "DT_FLOAT\n value_double: -0.000534315\n value_double: " @@ -351,11 +364,13 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) { "value_double: 0.000374641\n value_double: -0.00149603\n " "value_double: -0.000317367\n value_double: -0.000417829\n }\n " "cpu_exec_micros: 2\n total_cpu_exec_micros: 2\n run_count: 1\n " - "total_run_count: 1\n total_definition_count: 10\n output_bytes: " - "1280\n total_output_bytes: 1280\n}\ntotal_float_ops: " - "10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: " - "4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: " - "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n", + "total_run_count: 1\n total_definition_count: 10\n peak_bytes: 1280\n " + "residual_bytes: 1280\n output_bytes: 1280\n total_peak_bytes: 1280\n " + "total_residual_bytes: 1280\n total_output_bytes: " + "1280\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: " + "404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: " + "6\ntotal_definition_count: 35\ntotal_peak_bytes: " + "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); } diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto index f92301133a..b49bdf64ac 100644 --- a/tensorflow/core/profiler/tfprof_log.proto +++ b/tensorflow/core/profiler/tfprof_log.proto @@ -124,9 +124,10 @@ message ExecProfile { int64 residual_bytes = 9; // Total bytes output by the op (not necessarily requested by the op). int64 output_bytes = 10; - // Total temporary bytes allocated and released by the op. + // NOTE: Please don't depend on the following 4 fields yet. Due to + // TensorFlow internal tracing issues, the numbers can be quite wrong. + // TODO(xpan): Fix the TensorFlow internal tracing. int64 host_temp_bytes = 11; - // Total persistent bytes (e.g. variable) allocated by the op. int64 host_persistent_bytes = 12; int64 accelerator_temp_bytes = 13; int64 accelerator_persistent_bytes = 14; |