aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/profiler
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2017-11-30 15:20:30 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-11-30 15:23:54 -0800
commitce4200eae990d7f5efdfb727939d38bf48001ba2 (patch)
tree7c9c3e9cd932273198fa2ef8e4fb24637de1f42f /tensorflow/core/profiler
parent6bfc73a0b3c6810725a5eb0020470457cc5cc23e (diff)
Fix profiler to track some missed persistent bytes.
PiperOrigin-RevId: 177516249
Diffstat (limited to 'tensorflow/core/profiler')
-rw-r--r--tensorflow/core/profiler/g3doc/options.md11
-rw-r--r--tensorflow/core/profiler/internal/tfprof_node.cc28
-rw-r--r--tensorflow/core/profiler/internal/tfprof_node.h10
-rw-r--r--tensorflow/core/profiler/internal/tfprof_show_test.cc37
-rw-r--r--tensorflow/core/profiler/internal/tfprof_stats_test.cc105
-rw-r--r--tensorflow/core/profiler/tfprof_log.proto5
6 files changed, 119 insertions, 77 deletions
diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md
index 4c73e372e3..dd12f76d6f 100644
--- a/tensorflow/core/profiler/g3doc/options.md
+++ b/tensorflow/core/profiler/g3doc/options.md
@@ -60,11 +60,14 @@ Currently, profiler only tracks the allocation of memory. As a result, the
accumulated memory request is uaually larger than the peak memory of the overall
model.
-bytes: The memory allocations requested by the operation.
-peak_bytes: The peak requested memory (not de-allocated) by the operation.
-residual_bytes: The memory requested by the operation and not de-allocated
+It's recommended to generate timeline to see the allocator memory usage over
+time.
+
+`bytes`: The memory allocations requested by the operation.
+`peak_bytes`: The peak requested memory (not de-allocated) by the operation.
+`residual_bytes`: The memory requested by the operation and not de-allocated
when Compute finishes.
-output_bytes: The memory output by the operation. It's not necessarily requested
+`output_bytes`: The memory output by the operation. It's not necessarily requested
by the current operation. For example, it can be a tensor
forwarded from input to output, with in-place mutation.
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 671b65d708..5cd1050bcc 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -139,6 +139,25 @@ void ExecStep::AddMemoryStats(const string& dev,
exec_.accelerator_persistent_bytes() +
step_stat.memory_stats().device_persistent_memory_size());
}
+
+ // TODO(xpan): Make this more accurate:
+ // High level: Memory tracking is suspicous and requires large scale
+ // clean up.
+ // Investigte the memory usage difference between CPU/GPU with OpViewTest.
+ //
+ // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
+ // discuss OpKernelContext-related allocations.
+ // 2. allocate_output calls allocate_tensor, which is properly tracked in
+ // 'NodeExecStats.memory'.
+ // 3. allocate_temp is only tracked through record_xxx_temp. It appears
+ // in 'NodeExecStats.memory_stats'.
+ // 4. allocate_persistent calls allocate_tensor, which is properly tracked
+ // in 'NodeExecStats.memory'. However, there is no way to count it as
+ // persistent now.
+ // 5. record_xxx_persistent is called when allocate_persistent
+ // is not used and hence tracks some complementary bytes. It appears in
+ // 'NodeExecStats.memory_stats'. It's suspicious. But we should
+ // use it now since it covers constant op.
int64 residual_bytes = 0;
int64 requested_bytes = 0;
int64 peak_bytes = 0;
@@ -147,6 +166,15 @@ void ExecStep::AddMemoryStats(const string& dev,
requested_bytes += mem.total_bytes();
peak_bytes += mem.peak_bytes();
}
+ residual_bytes +=
+ exec_.host_persistent_bytes() + exec_.accelerator_persistent_bytes();
+ requested_bytes += exec_.host_persistent_bytes() +
+ exec_.accelerator_persistent_bytes() +
+ exec_.host_temp_bytes() + exec_.accelerator_temp_bytes();
+ peak_bytes += exec_.host_persistent_bytes() +
+ exec_.accelerator_persistent_bytes() + exec_.host_temp_bytes() +
+ exec_.accelerator_temp_bytes();
+
exec_.set_requested_bytes(requested_bytes);
exec_.set_residual_bytes(residual_bytes);
exec_.set_peak_bytes(peak_bytes);
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
index e2d0563a07..77c14cb792 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -593,17 +593,11 @@ class TFGraphNode {
int64 accelerator_persistent_bytes() const {
int64 persistent_bytes = 0;
for (const auto& exec : execs_) {
- persistent_bytes += exec.second.accelerator_persistent_bytes();
+ persistent_bytes = std::max(persistent_bytes,
+ exec.second.accelerator_persistent_bytes());
}
return persistent_bytes;
}
- int64 host_persistent_bytes(int64 step) const {
- auto exec = execs_.find(step);
- if (exec == execs_.end()) {
- return 0;
- }
- return exec->second.host_persistent_bytes();
- }
const std::map<int32, std::pair<int64, uint64>>& output_memory(
int64 step) const {
auto exec = execs_.find(step);
diff --git a/tensorflow/core/profiler/internal/tfprof_show_test.cc b/tensorflow/core/profiler/internal/tfprof_show_test.cc
index 1f19f8c322..98773ae19e 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show_test.cc
@@ -105,12 +105,13 @@ TEST_F(TFProfShowTest, DumpScopeMode) {
"node name | # parameters | # float_ops | requested bytes | peak bytes | "
"residual bytes | output bytes | total execution time | accelerator "
"execution time | cpu execution time\n_TFProfRoot (--/451 params, --/0 "
- "flops, --/0B, --/0B, --/0B, --/2.56KB, --/13us, --/0us, --/13us)\n DW "
- "(3x3x3x6, 162/162 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, "
- "1.28KB/1.28KB, 2us/2us, 0us/0us, 2us/2us)\n DW2 (2x2x6x12, 288/288 "
- "params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, 1.28KB/1.28KB, 11us/11us, "
- "0us/0us, 11us/11us)\n ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, "
- "0B/0B, 0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
+ "flops, --/2.56KB, --/2.56KB, --/2.56KB, --/2.56KB, --/13us, --/0us, "
+ "--/13us)\n DW (3x3x3x6, 162/162 params, 0/0 flops, 1.28KB/1.28KB, "
+ "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 2us/2us, 0us/0us, "
+ "2us/2us)\n DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.28KB/1.28KB, "
+ "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 11us/11us, 0us/0us, "
+ "11us/11us)\n ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, "
+ "0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
dump_str);
EXPECT_EQ(dump_str, TestToFromProto("scope", opts));
@@ -178,22 +179,22 @@ TEST_F(TFProfShowTest, DumpOpMode) {
EXPECT_EQ(
"nodename|requestedbytes|totalexecutiontime|acceleratorexecutiontime|"
"cpuexecutiontime|#parameters|#float_ops|opoccurrence(run|defined)|"
- "inputshapes\nVariableV20B(0.00%,0.00%),13us(100.00%,0.26%),0us(100.00%,"
- "0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_ops(100.00%"
- ",0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:13us\n\nAdd0B("
- "0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99.71%,0.00%),"
- "0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_type:0:1,"
- "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12,\t1:1\t("
- "run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t(run*0|"
- "defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0.00%),"
- "0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_ops("
- "100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_"
+ "inputshapes\nVariableV22.56KB(100.00%,8.40%),13us(100.00%,0.26%),0us("
+ "100.00%,0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_"
+ "ops(100.00%,0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:"
+ "13us\n\nAdd0B(0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99."
+ "71%,0.00%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_"
+ "type:0:1,\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12,"
+ "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t("
+ "run*0|defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0."
+ "00%),0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_"
+ "ops(100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_"
"time:0us\ninput_type:0:2x2x6x12,\t1:2x2x6x12\t(run*0|defined*1)\texec_"
"time:0us\ninput_type:0:3x3x3x6,\t1:3x3x3x6\t(run*0|defined*1)\texec_"
"time:0us\n\nConst0B(0.00%,0.00%),2us(99.74%,0.04%),0us(100.00%,0.00%),"
"2us(99.71%,0.04%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),1|"
- "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D14.59KB("
- "100.00%,100.00%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99."
+ "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D27.90KB("
+ "91.60%,91.60%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99."
"67%,98.77%),0params(0.00%,0.00%),10.44kfloat_ops(100.00%,100.00%),2|"
"2\n\ninput_type:0:2x3x3x6,\t1:2x2x6x12\t(run*1|defined*1)\texec_time:"
"597us\ninput_type:0:2x6x6x3,\t1:3x3x3x6\t(run*1|defined*1)\texec_time:4."
diff --git a/tensorflow/core/profiler/internal/tfprof_stats_test.cc b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
index 2f2101d76b..b86a83cb1b 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
@@ -89,21 +89,27 @@ TEST_F(TFProfStatsTest, CustomOpType) {
GraphNodeProto expected;
CHECK(protobuf::TextFormat::ParseFromString(
- "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: "
- "451\nchildren {\n name: \"DW\"\n exec_micros: 2\n parameters: 162\n "
- "total_exec_micros: 2\n total_parameters: 162\n devices: "
+ "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: "
+ "2560\ntotal_parameters: 451\nchildren {\n name: \"DW\"\n exec_micros: "
+ "2\n requested_bytes: 1280\n parameters: 162\n total_exec_micros: 2\n "
+ " total_requested_bytes: 1280\n total_parameters: 162\n devices: "
"\"/job:localhost/replica:0/task:0/gpu:0\"\n cpu_exec_micros: 2\n "
"total_cpu_exec_micros: 2\n run_count: 1\n total_run_count: 1\n "
- "total_definition_count: 1\n output_bytes: 1280\n total_output_bytes: "
- "1280\n}\nchildren {\n name: \"DW2\"\n exec_micros: 11\n parameters: "
- "288\n total_exec_micros: 11\n total_parameters: 288\n devices: "
+ "total_definition_count: 1\n peak_bytes: 1280\n residual_bytes: 1280\n "
+ " output_bytes: 1280\n total_peak_bytes: 1280\n total_residual_bytes: "
+ "1280\n total_output_bytes: 1280\n}\nchildren {\n name: \"DW2\"\n "
+ "exec_micros: 11\n requested_bytes: 1280\n parameters: 288\n "
+ "total_exec_micros: 11\n total_requested_bytes: 1280\n "
+ "total_parameters: 288\n devices: "
"\"/job:localhost/replica:0/task:0/gpu:0\"\n cpu_exec_micros: 11\n "
"total_cpu_exec_micros: 11\n run_count: 1\n total_run_count: 1\n "
- "total_definition_count: 1\n output_bytes: 1280\n total_output_bytes: "
- "1280\n}\nchildren {\n name: \"ScalarW\"\n parameters: 1\n "
- "total_parameters: 1\n total_definition_count: "
+ "total_definition_count: 1\n peak_bytes: 1280\n residual_bytes: 1280\n "
+ " output_bytes: 1280\n total_peak_bytes: 1280\n total_residual_bytes: "
+ "1280\n total_output_bytes: 1280\n}\nchildren {\n name: \"ScalarW\"\n "
+ "parameters: 1\n total_parameters: 1\n total_definition_count: "
"1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: "
- "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
+ "2\ntotal_definition_count: 3\ntotal_peak_bytes: "
+ "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -119,21 +125,27 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
GraphNodeProto expected;
CHECK(protobuf::TextFormat::ParseFromString(
- "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: "
- "451\nchildren {\n name: \"DW\"\n exec_micros: 2\n parameters: 162\n "
- "total_exec_micros: 2\n total_parameters: 162\n devices: "
+ "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: "
+ "2560\ntotal_parameters: 451\nchildren {\n name: \"DW\"\n exec_micros: "
+ "2\n requested_bytes: 1280\n parameters: 162\n total_exec_micros: 2\n "
+ " total_requested_bytes: 1280\n total_parameters: 162\n devices: "
"\"/job:localhost/replica:0/task:0/gpu:0\"\n cpu_exec_micros: 2\n "
"total_cpu_exec_micros: 2\n run_count: 1\n total_run_count: 1\n "
- "total_definition_count: 1\n output_bytes: 1280\n total_output_bytes: "
- "1280\n}\nchildren {\n name: \"DW2\"\n exec_micros: 11\n parameters: "
- "288\n total_exec_micros: 11\n total_parameters: 288\n devices: "
+ "total_definition_count: 1\n peak_bytes: 1280\n residual_bytes: 1280\n "
+ " output_bytes: 1280\n total_peak_bytes: 1280\n total_residual_bytes: "
+ "1280\n total_output_bytes: 1280\n}\nchildren {\n name: \"DW2\"\n "
+ "exec_micros: 11\n requested_bytes: 1280\n parameters: 288\n "
+ "total_exec_micros: 11\n total_requested_bytes: 1280\n "
+ "total_parameters: 288\n devices: "
"\"/job:localhost/replica:0/task:0/gpu:0\"\n cpu_exec_micros: 11\n "
"total_cpu_exec_micros: 11\n run_count: 1\n total_run_count: 1\n "
- "total_definition_count: 1\n output_bytes: 1280\n total_output_bytes: "
- "1280\n}\nchildren {\n name: \"ScalarW\"\n parameters: 1\n "
- "total_parameters: 1\n total_definition_count: "
+ "total_definition_count: 1\n peak_bytes: 1280\n residual_bytes: 1280\n "
+ " output_bytes: 1280\n total_peak_bytes: 1280\n total_residual_bytes: "
+ "1280\n total_output_bytes: 1280\n}\nchildren {\n name: \"ScalarW\"\n "
+ "parameters: 1\n total_parameters: 1\n total_definition_count: "
"1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: "
- "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
+ "2\ntotal_definition_count: 3\ntotal_peak_bytes: "
+ "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -150,7 +162,7 @@ TEST_F(TFProfStatsTest, TestGraph) {
GraphNodeProto expected;
CHECK(protobuf::TextFormat::ParseFromString(
"name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
- "14592\ntotal_parameters: 451\nchildren {\n name: "
+ "30464\ntotal_parameters: 451\nchildren {\n name: "
"\"DW/Initializer/random_normal/mul\"\n children {\n name: "
"\"DW/Initializer/random_normal/RandomStandardNormal\"\n children {\n "
" name: \"DW/Initializer/random_normal/shape\"\n "
@@ -166,7 +178,7 @@ TEST_F(TFProfStatsTest, TestGraph) {
"4\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: "
"404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: "
"6\ntotal_definition_count: 32\ntotal_peak_bytes: "
- "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+ "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -181,9 +193,9 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
GraphNodeProto expected;
CHECK(protobuf::TextFormat::ParseFromString(
"name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
- "14592\ntotal_parameters: 451\nchildren {\n name: \"Conv2D\"\n "
- "exec_micros: 4292\n requested_bytes: 9472\n total_exec_micros: 4292\n "
- " total_requested_bytes: 9472\n devices: "
+ "30464\ntotal_parameters: 451\nchildren {\n name: \"Conv2D\"\n "
+ "exec_micros: 4292\n requested_bytes: 18176\n total_exec_micros: "
+ "4292\n total_requested_bytes: 18176\n devices: "
"\"/job:localhost/replica:0/task:0/gpu:0\"\n float_ops: 5832\n "
"total_float_ops: 5832\n input_shapes {\n key: 0\n value {\n "
"dim {\n size: 2\n }\n dim {\n size: 6\n "
@@ -194,11 +206,11 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
"6\n }\n }\n }\n accelerator_exec_micros: 226\n "
"cpu_exec_micros: 4066\n total_accelerator_exec_micros: 226\n "
"total_cpu_exec_micros: 4066\n run_count: 1\n total_run_count: 1\n "
- "total_definition_count: 1\n peak_bytes: 5888\n residual_bytes: 768\n "
- "output_bytes: 768\n total_peak_bytes: 5888\n total_residual_bytes: "
+ "total_definition_count: 1\n peak_bytes: 14592\n residual_bytes: 768\n "
+ " output_bytes: 768\n total_peak_bytes: 14592\n total_residual_bytes: "
"768\n total_output_bytes: 768\n}\nchildren {\n name: \"Conv2D_1\"\n "
- "exec_micros: 597\n requested_bytes: 5120\n total_exec_micros: 597\n "
- "total_requested_bytes: 5120\n devices: "
+ "exec_micros: 597\n requested_bytes: 9728\n total_exec_micros: 597\n "
+ "total_requested_bytes: 9728\n devices: "
"\"/job:localhost/replica:0/task:0/gpu:0\"\n float_ops: 4608\n "
"total_float_ops: 4608\n input_shapes {\n key: 0\n value {\n "
"dim {\n size: 2\n }\n dim {\n size: 3\n "
@@ -209,12 +221,12 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
"12\n }\n }\n }\n accelerator_exec_micros: 178\n "
"cpu_exec_micros: 419\n total_accelerator_exec_micros: 178\n "
"total_cpu_exec_micros: 419\n run_count: 1\n total_run_count: 1\n "
- "total_definition_count: 1\n peak_bytes: 4096\n residual_bytes: 512\n "
- "output_bytes: 512\n total_peak_bytes: 4096\n total_residual_bytes: "
+ "total_definition_count: 1\n peak_bytes: 8704\n residual_bytes: 512\n "
+ "output_bytes: 512\n total_peak_bytes: 8704\n total_residual_bytes: "
"512\n total_output_bytes: 512\n}\ntotal_float_ops: "
"10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: "
"4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: "
- "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+ "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -231,9 +243,9 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
GraphNodeProto expected;
CHECK(protobuf::TextFormat::ParseFromString(
"name: \"_TFProfRoot\"\ntotal_exec_micros: 597\ntotal_requested_bytes: "
- "5120\nchildren {\n name: \"Conv2D_1\"\n exec_micros: 597\n "
- "requested_bytes: 5120\n total_exec_micros: 597\n "
- "total_requested_bytes: 5120\n devices: "
+ "9728\nchildren {\n name: \"Conv2D_1\"\n exec_micros: 597\n "
+ "requested_bytes: 9728\n total_exec_micros: 597\n "
+ "total_requested_bytes: 9728\n devices: "
"\"/job:localhost/replica:0/task:0/gpu:0\"\n float_ops: 4608\n "
"total_float_ops: 4608\n input_shapes {\n key: 0\n value {\n "
"dim {\n size: 2\n }\n dim {\n size: 3\n "
@@ -244,12 +256,12 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
"12\n }\n }\n }\n accelerator_exec_micros: 178\n "
"cpu_exec_micros: 419\n total_accelerator_exec_micros: 178\n "
"total_cpu_exec_micros: 419\n run_count: 1\n total_run_count: 1\n "
- "total_definition_count: 1\n peak_bytes: 4096\n residual_bytes: 512\n "
- "output_bytes: 512\n total_peak_bytes: 4096\n total_residual_bytes: "
+ "total_definition_count: 1\n peak_bytes: 8704\n residual_bytes: 512\n "
+ "output_bytes: 512\n total_peak_bytes: 8704\n total_residual_bytes: "
"512\n total_output_bytes: 512\n}\ntotal_float_ops: "
"4608\ntotal_accelerator_exec_micros: 178\ntotal_cpu_exec_micros: "
"419\ntotal_run_count: 1\ntotal_definition_count: 2\ntotal_peak_bytes: "
- "4096\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
+ "8704\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
@@ -265,8 +277,9 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
GraphNodeProto expected;
CHECK(protobuf::TextFormat::ParseFromString(
"name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
- "14592\ntotal_parameters: 451\nchildren {\n name: \"DW\"\n "
- "exec_micros: 2\n parameters: 162\n total_exec_micros: 2\n "
+ "30464\ntotal_parameters: 451\nchildren {\n name: \"DW\"\n "
+ "exec_micros: 2\n requested_bytes: 1280\n parameters: 162\n "
+ "total_exec_micros: 2\n total_requested_bytes: 1280\n "
"total_parameters: 162\n devices: "
"\"/job:localhost/replica:0/task:0/gpu:0\"\n tensor_value {\n dtype: "
"DT_FLOAT\n value_double: -0.000534315\n value_double: "
@@ -351,11 +364,13 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
"value_double: 0.000374641\n value_double: -0.00149603\n "
"value_double: -0.000317367\n value_double: -0.000417829\n }\n "
"cpu_exec_micros: 2\n total_cpu_exec_micros: 2\n run_count: 1\n "
- "total_run_count: 1\n total_definition_count: 10\n output_bytes: "
- "1280\n total_output_bytes: 1280\n}\ntotal_float_ops: "
- "10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: "
- "4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: "
- "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+ "total_run_count: 1\n total_definition_count: 10\n peak_bytes: 1280\n "
+ "residual_bytes: 1280\n output_bytes: 1280\n total_peak_bytes: 1280\n "
+ "total_residual_bytes: 1280\n total_output_bytes: "
+ "1280\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: "
+ "404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: "
+ "6\ntotal_definition_count: 35\ntotal_peak_bytes: "
+ "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
}
diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto
index f92301133a..b49bdf64ac 100644
--- a/tensorflow/core/profiler/tfprof_log.proto
+++ b/tensorflow/core/profiler/tfprof_log.proto
@@ -124,9 +124,10 @@ message ExecProfile {
int64 residual_bytes = 9;
// Total bytes output by the op (not necessarily requested by the op).
int64 output_bytes = 10;
- // Total temporary bytes allocated and released by the op.
+ // NOTE: Please don't depend on the following 4 fields yet. Due to
+ // TensorFlow internal tracing issues, the numbers can be quite wrong.
+ // TODO(xpan): Fix the TensorFlow internal tracing.
int64 host_temp_bytes = 11;
- // Total persistent bytes (e.g. variable) allocated by the op.
int64 host_persistent_bytes = 12;
int64 accelerator_temp_bytes = 13;
int64 accelerator_persistent_bytes = 14;