author     A. Unique TensorFlower <gardener@tensorflow.org>   2017-12-08 16:30:02 -0800
committer  TensorFlower Gardener <gardener@tensorflow.org>    2017-12-08 16:33:27 -0800
commit     a0c21217fcf2993c5625a726c62a04b749afcddf (patch)
tree       75b02abe7c2cd17894201a95fe41225e4c24ed3f /tensorflow/core/profiler
parent     ddfd6253fe0870779abc78be52c872d86b03f577 (diff)
Make profiler memory profiling work with tf.while_loop
PiperOrigin-RevId: 178443834
Diffstat (limited to 'tensorflow/core/profiler')
-rw-r--r--  tensorflow/core/profiler/internal/tfprof_node.cc            68
-rw-r--r--  tensorflow/core/profiler/internal/tfprof_node.h             123
-rw-r--r--  tensorflow/core/profiler/internal/tfprof_timeline.cc        16
-rw-r--r--  tensorflow/core/profiler/internal/tfprof_timeline_test.cc   2
-rw-r--r--  tensorflow/core/profiler/tfprof_log.proto                   53
5 files changed, 156 insertions, 106 deletions
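Context for the diff below: the commit makes each scheduling of an op inside a tf.while_loop contribute its own memory record (ExecMemory) instead of overwriting a single per-op snapshot, so requested/peak/residual bytes reported by tfprof sum over all loop iterations. As a point of reference only (not part of this commit), here is a hedged sketch of the kind of run this change targets, assuming the TF 1.x Python profiler API (tf.RunMetadata, tf.profiler.profile); the loop body and variable names are illustrative.

# Illustrative only -- not part of this commit. Assumes the TF 1.x Python
# profiler API (tf.RunMetadata, tf.profiler.profile).
import tensorflow as tf

g = tf.Graph()
with g.as_default():
    i0 = tf.constant(0)
    x0 = tf.random_normal([1024, 1024])
    # Each loop iteration schedules the body ops again; with this change the
    # profiler keeps one memory record (ExecMemory) per scheduling.
    _, result = tf.while_loop(lambda i, x: i < 10,
                              lambda i, x: [i + 1, tf.matmul(x, x)],
                              [i0, x0])

run_meta = tf.RunMetadata()
with tf.Session(graph=g) as sess:
    sess.run(result,
             options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
             run_metadata=run_meta)

# Memory stats for ops inside the loop now aggregate over every scheduling
# rather than reflecting only the last one.
tf.profiler.profile(g,
                    run_meta=run_meta,
                    cmd='scope',
                    options=tf.profiler.ProfileOptionBuilder.time_and_memory())
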
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 5cd1050bcc..2945c9510f 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -80,10 +80,15 @@ void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
void ExecStep::AddMemoryStats(const string& dev,
const NodeExecStats& step_stat) {
- if (exec_.memory_intialized()) {
+ ExecMemory exec_mem;
+ if (step_stat.all_start_micros() > 0) {
+ exec_mem.set_memory_micros(step_stat.all_start_micros() +
+ step_stat.op_end_rel_micros());
+ } else {
+ fprintf(stderr, "%s has no start time, skipping\n",
+ step_stat.node_name().c_str());
return;
}
- exec_.set_memory_intialized(true);
int accelerator_allocator_cnt = 0;
for (const auto& mem : step_stat.memory()) {
@@ -93,14 +98,12 @@ void ExecStep::AddMemoryStats(const string& dev,
continue;
}
++accelerator_allocator_cnt;
- exec_.set_allocator_bytes_in_use(
- std::max(static_cast<int64>(exec_.allocator_bytes_in_use()),
+ exec_mem.set_allocator_bytes_in_use(
+ std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
static_cast<int64>(mem.allocator_bytes_in_use())));
- Allocation allocation;
for (const auto& alloc : mem.allocation_records()) {
- allocation.add_allocation_records()->MergeFrom(alloc);
+ allocations_.push_back(alloc);
}
- allocations_.push_back(allocation);
}
if (accelerator_allocator_cnt > 1) {
fprintf(stderr, "found %d gpu allocator for 1 node\n",
@@ -121,22 +124,26 @@ void ExecStep::AddMemoryStats(const string& dev,
uint64 output_ptr =
output.tensor_description().allocation_description().ptr();
total_output_bytes += output_bytes;
- output_memory_[output.slot()] = std::make_pair(output_bytes, output_ptr);
+
+ auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
+ mem.set_ptr(output_ptr);
+ mem.set_bytes(output_bytes);
}
}
- exec_.set_output_bytes(total_output_bytes);
+ exec_mem.set_output_bytes(total_output_bytes);
if (step_stat.has_memory_stats()) {
- exec_.set_host_temp_bytes(exec_.host_temp_bytes() +
- step_stat.memory_stats().host_temp_memory_size());
- exec_.set_host_persistent_bytes(
- exec_.host_persistent_bytes() +
+ exec_mem.set_host_temp_bytes(
+ exec_mem.host_temp_bytes() +
+ step_stat.memory_stats().host_temp_memory_size());
+ exec_mem.set_host_persistent_bytes(
+ exec_mem.host_persistent_bytes() +
step_stat.memory_stats().host_persistent_memory_size());
- exec_.set_accelerator_temp_bytes(
- exec_.accelerator_temp_bytes() +
+ exec_mem.set_accelerator_temp_bytes(
+ exec_mem.accelerator_temp_bytes() +
step_stat.memory_stats().device_temp_memory_size());
- exec_.set_accelerator_persistent_bytes(
- exec_.accelerator_persistent_bytes() +
+ exec_mem.set_accelerator_persistent_bytes(
+ exec_mem.accelerator_persistent_bytes() +
step_stat.memory_stats().device_persistent_memory_size());
}
@@ -166,18 +173,20 @@ void ExecStep::AddMemoryStats(const string& dev,
requested_bytes += mem.total_bytes();
peak_bytes += mem.peak_bytes();
}
- residual_bytes +=
- exec_.host_persistent_bytes() + exec_.accelerator_persistent_bytes();
- requested_bytes += exec_.host_persistent_bytes() +
- exec_.accelerator_persistent_bytes() +
- exec_.host_temp_bytes() + exec_.accelerator_temp_bytes();
- peak_bytes += exec_.host_persistent_bytes() +
- exec_.accelerator_persistent_bytes() + exec_.host_temp_bytes() +
- exec_.accelerator_temp_bytes();
+ residual_bytes += exec_mem.host_persistent_bytes() +
+ exec_mem.accelerator_persistent_bytes();
+ requested_bytes += exec_mem.host_persistent_bytes() +
+ exec_mem.accelerator_persistent_bytes() +
+ exec_mem.host_temp_bytes() +
+ exec_mem.accelerator_temp_bytes();
+ peak_bytes += exec_mem.host_persistent_bytes() +
+ exec_mem.accelerator_persistent_bytes() +
+ exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();
- exec_.set_requested_bytes(requested_bytes);
- exec_.set_residual_bytes(residual_bytes);
- exec_.set_peak_bytes(peak_bytes);
+ exec_mem.set_requested_bytes(requested_bytes);
+ exec_mem.set_residual_bytes(residual_bytes);
+ exec_mem.set_peak_bytes(peak_bytes);
+ memory_execs_.emplace_back(exec_mem);
}
void TFGraphNode::AddStepStat(int64 step, const string& device,
@@ -279,5 +288,8 @@ bool IsPlacedOnAccelerator(const string& device) {
return device.find("gpu") != device.npos ||
device.find("sycl") != device.npos;
}
+bool IsPlacedOnCPU(const string& device) {
+ return device.find("cpu") != device.npos;
+}
} // namespace tfprof
} // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
index 77c14cb792..5bc2ea3c42 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -109,7 +109,6 @@ class ExecStep {
const {
return cpu_execs_;
}
-
int64 all_start_micros() const { return exec_.all_start_micros(); }
int64 latest_end_micros() const { return exec_.latest_end_micros(); }
int64 lastest_schedule_end_micros() const {
@@ -121,27 +120,73 @@ class ExecStep {
}
return ret;
}
-
- int64 requested_bytes() const { return exec_.requested_bytes(); }
- int64 peak_bytes() const { return exec_.peak_bytes(); }
- int64 residual_bytes() const { return exec_.residual_bytes(); }
- int64 output_bytes() const { return exec_.output_bytes(); }
+ int64 requested_bytes() const {
+ int64 requested_bytes = 0;
+ for (const ExecMemory& exec : memory_execs_) {
+ requested_bytes += exec.requested_bytes();
+ }
+ return requested_bytes;
+ }
+ int64 peak_bytes() const {
+ int64 peak_bytes = 0;
+ for (const ExecMemory& exec : memory_execs_) {
+ peak_bytes += exec.peak_bytes();
+ }
+ return peak_bytes;
+ }
+ int64 residual_bytes() const {
+ int64 residual_bytes = 0;
+ for (const ExecMemory& exec : memory_execs_) {
+ residual_bytes += exec.residual_bytes();
+ }
+ return residual_bytes;
+ }
+ int64 output_bytes() const {
+ int64 output_bytes = 0;
+ for (const ExecMemory& exec : memory_execs_) {
+ output_bytes += exec.output_bytes();
+ }
+ return output_bytes;
+ }
int64 accelerator_temp_bytes() const {
- return exec_.accelerator_temp_bytes();
+ int64 accelerator_temp_bytes = 0;
+ for (const ExecMemory& exec : memory_execs_) {
+ accelerator_temp_bytes += exec.accelerator_temp_bytes();
+ }
+ return accelerator_temp_bytes;
+ }
+ int64 host_temp_bytes() const {
+ int64 host_temp_bytes = 0;
+ for (const ExecMemory& exec : memory_execs_) {
+ host_temp_bytes += exec.host_temp_bytes();
+ }
+ return host_temp_bytes;
}
- int64 host_temp_bytes() const { return exec_.host_temp_bytes(); }
int64 accelerator_persistent_bytes() const {
- return exec_.accelerator_persistent_bytes();
+ int64 accelerator_persistent_bytes = 0;
+ for (const ExecMemory& exec : memory_execs_) {
+ accelerator_persistent_bytes += exec.accelerator_persistent_bytes();
+ }
+ return accelerator_persistent_bytes;
}
- int64 host_persistent_bytes() const { return exec_.host_persistent_bytes(); }
- const std::map<int32, std::pair<int64, uint64>>& output_memory() const {
- return output_memory_;
+ int64 host_persistent_bytes() const {
+ int64 host_persistent_bytes = 0;
+ for (const ExecMemory& exec : memory_execs_) {
+ host_persistent_bytes += exec.host_persistent_bytes();
+ }
+ return host_persistent_bytes;
}
- int64 allocator_bytes_in_use() const {
- return exec_.allocator_bytes_in_use();
+ std::map<int64, int64> allocator_bytes_in_use() const {
+ std::map<int64, int64> bytes_in_use;
+ for (const ExecMemory& exec : memory_execs_) {
+ bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use();
+ }
+ return bytes_in_use;
}
- const std::vector<Allocation>& allocations() const { return allocations_; }
+ const std::vector<AllocationRecord>& allocations() const {
+ return allocations_;
+ }
const ExecProfile& ToProto() {
exec_.mutable_accelerator_execs()->clear();
@@ -169,19 +214,15 @@ class ExecStep {
for (const string& d : devices_) {
exec_.add_devices(d);
}
-
- exec_.mutable_output_memory()->clear();
- for (const auto& mem : output_memory_) {
- auto& mem_pb = (*exec_.mutable_output_memory())[mem.first];
- mem_pb.set_bytes(mem.second.first);
- mem_pb.set_ptr(mem.second.second);
- }
-
exec_.mutable_allocations()->Clear();
for (const auto& r : allocations_) {
exec_.add_allocations()->MergeFrom(r);
}
+ exec_.mutable_memory_execs()->Clear();
+ for (const auto& m : memory_execs_) {
+ exec_.add_memory_execs()->MergeFrom(m);
+ }
return exec_;
}
@@ -197,6 +238,7 @@ class ExecStep {
op_execs_.clear();
allocations_.clear();
+ memory_execs_.clear();
for (const auto& exec_time : exec_.accelerator_execs()) {
auto& exec = accelerator_execs_[exec_time.first];
@@ -214,15 +256,12 @@ class ExecStep {
op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
}
}
- for (const auto& output_mem : exec_.output_memory()) {
- auto& mem = output_memory_[output_mem.first];
- mem.first = output_mem.second.bytes();
- mem.second = output_mem.second.ptr();
- }
-
for (const auto& r : exec_.allocations()) {
allocations_.push_back(r);
}
+ for (const auto& m : exec_.memory_execs()) {
+ memory_execs_.push_back(m);
+ }
}
private:
@@ -237,14 +276,15 @@ class ExecStep {
std::map<string, std::vector<std::pair<int64, int64>>> cpu_execs_;
// combines accelerator_execs_ and cpu_execs_.
std::map<string, std::vector<std::pair<int64, int64>>> op_execs_;
+ // Each ExecMemory corresponds to one scheduling of the op. Normally,
+ // there are multiple schedulings in while_loop.
+ std::vector<ExecMemory> memory_execs_;
// All devices the op is associated with (e.g. gpu:0 (scheduling),
// gpu:0:stream:xx (kernel exec), cpu:0 host)
std::set<string> devices_;
- // output_idx -> {output_bytes, memory_ptr}
- std::map<int32, std::pair<int64, uint64>> output_memory_;
// The history of accelerator allocations and deallocations of this step.
- std::vector<Allocation> allocations_;
+ std::vector<AllocationRecord> allocations_;
};
#define GRAPH_NODE_BYTES(type) \
@@ -598,23 +638,15 @@ class TFGraphNode {
}
return persistent_bytes;
}
- const std::map<int32, std::pair<int64, uint64>>& output_memory(
- int64 step) const {
+ const std::map<int64, int64> allocator_bytes_in_use(int64 step) const {
auto exec = execs_.find(step);
if (exec == execs_.end()) {
- return empty_output_memory_;
- }
- return exec->second.output_memory();
- }
- int64 allocator_bytes_in_use(int64 step) const {
- auto exec = execs_.find(step);
- if (exec == execs_.end()) {
- return 0;
+ return empty_bytes_in_use_;
}
return exec->second.allocator_bytes_in_use();
}
- const std::vector<Allocation>& allocations(int64 step) const {
+ const std::vector<AllocationRecord>& allocations(int64 step) const {
auto exec = execs_.find(step);
if (exec == execs_.end()) {
return empty_allocations_;
@@ -719,9 +751,9 @@ class TFGraphNode {
std::map<int64, ExecStep> execs_;
// Placeholder for empty cases.
- std::map<int32, std::pair<int64, uint64>> empty_output_memory_;
+ std::map<int64, int64> empty_bytes_in_use_;
std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_;
- std::vector<Allocation> empty_allocations_;
+ std::vector<AllocationRecord> empty_allocations_;
};
class TFMultiGraphNode {
@@ -874,6 +906,7 @@ class TFMultiGraphNode {
std::map<string, const TFGraphNode*> nodes_;
};
+bool IsPlacedOnCPU(const string& device);
bool IsPlacedOnAccelerator(const string& device);
bool CountAsAcceleratorTime(const string& device);
bool CountAsCPUTime(const string& device);
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc
index bdb000747d..b0dd8ce5e0 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc
@@ -153,10 +153,8 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
std::map<int64, int64> allocs;
for (const auto& alloc : node->node->allocations(step)) {
- for (const auto& r : alloc.allocation_records()) {
- allocs[r.alloc_micros()] += r.alloc_bytes();
- dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes();
- }
+ allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
+ dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
}
dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
allocs[0] += node->node->accelerator_persistent_bytes();
@@ -167,9 +165,9 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
last += it->second;
aggregate_allocs[it->first] = last;
}
- int64 end_micros = node->node->lastest_schedule_end_micros(step);
- if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) {
- dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step);
+ for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
+ if (bytes_in_use.first <= 0) continue;
+ dev.allocations[bytes_in_use.first] = bytes_in_use.second;
}
}
@@ -265,6 +263,10 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
}
}
for (const auto& dev : mem_tracker_.devices()) {
+ if (IsPlacedOnCPU(dev.first)) {
+ // TODO(xpan): Maybe also support CPU allocator memory tracking.
+ continue;
+ }
int64 pid = AllocatePID();
chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
int64 pid2 = AllocatePID();
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
index 91eac0cf76..6a7ab01029 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
@@ -71,7 +71,7 @@ TEST_F(TFProfTimelineTest, GraphView) {
string dump_str;
TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file + "_0", &dump_str));
- EXPECT_EQ(7932146665024565912ull, Hash64(dump_str));
+ EXPECT_EQ(16556121177519539380ull, Hash64(dump_str));
}
TEST_F(TFProfTimelineTest, ScopeView) {
diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto
index b49bdf64ac..0bf1b477ed 100644
--- a/tensorflow/core/profiler/tfprof_log.proto
+++ b/tensorflow/core/profiler/tfprof_log.proto
@@ -90,10 +90,6 @@ message ProfileNode {
map<int64, ExecProfile> execs = 12;
}
-message Allocation {
- repeated AllocationRecord allocation_records = 1;
-}
-
message ExecProfile {
// Can be larger than 1 if run multiple times in loop.
int64 run_count = 1;
@@ -110,35 +106,42 @@ message ExecProfile {
// For cpu, vector size can be larger than 1 if in tf.while_loop.
map<string, ExecTime> cpu_execs = 5;
- map<int32, Memory> output_memory = 17;
+ // Each entry to memory information of a scheduling of the node.
+ // Normally, there will be multiple entries in while_loop.
+ repeated ExecMemory memory_execs = 7;
+ // The allocation and deallocation times and sizes throughout execution.
+ repeated AllocationRecord allocations = 11;
+ // The devices related to this execution.
+ repeated string devices = 6;
+}
- repeated Allocation allocations = 18;
+message ExecTime {
+ repeated Tuple times = 1;
+}
- repeated string devices = 6;
+message ExecMemory {
+ // This is the timestamp when the memory information was tracked.
+ int64 memory_micros = 1;
+ // NOTE: Please don't depend on the following 4 fields yet. Due to
+ // TensorFlow internal tracing issues, the numbers can be quite wrong.
+ // TODO(xpan): Fix the TensorFlow internal tracing.
+ int64 host_temp_bytes = 2;
+ int64 host_persistent_bytes = 3;
+ int64 accelerator_temp_bytes = 4;
+ int64 accelerator_persistent_bytes = 5;
// Total bytes requested by the op.
- int64 requested_bytes = 7;
+ int64 requested_bytes = 6;
// Total bytes requested by the op and released before op end.
- int64 peak_bytes = 8;
+ int64 peak_bytes = 7;
// Total bytes requested by the op and not released after op end.
- int64 residual_bytes = 9;
+ int64 residual_bytes = 8;
// Total bytes output by the op (not necessarily requested by the op).
- int64 output_bytes = 10;
- // NOTE: Please don't depend on the following 4 fields yet. Due to
- // TensorFlow internal tracing issues, the numbers can be quite wrong.
- // TODO(xpan): Fix the TensorFlow internal tracing.
- int64 host_temp_bytes = 11;
- int64 host_persistent_bytes = 12;
- int64 accelerator_temp_bytes = 13;
- int64 accelerator_persistent_bytes = 14;
+ int64 output_bytes = 9;
// The total number of bytes currently allocated by the allocator if >0.
- int64 allocator_bytes_in_use = 15;
-
- bool memory_intialized = 16;
-}
-
-message ExecTime {
- repeated Tuple times = 1;
+ int64 allocator_bytes_in_use = 10;
+ // The memory of each output of the operation.
+ map<int32, Memory> output_memory = 11;
}
message Tuple {