diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2017-12-08 16:30:02 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-12-08 16:33:27 -0800 |
commit | a0c21217fcf2993c5625a726c62a04b749afcddf (patch) | |
tree | 75b02abe7c2cd17894201a95fe41225e4c24ed3f /tensorflow/core/profiler | |
parent | ddfd6253fe0870779abc78be52c872d86b03f577 (diff) |
Make profiler memory profiling work with tf.while_loop
PiperOrigin-RevId: 178443834
Diffstat (limited to 'tensorflow/core/profiler')
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node.cc | 68 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node.h | 123 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_timeline.cc | 16 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_timeline_test.cc | 2 | ||||
-rw-r--r-- | tensorflow/core/profiler/tfprof_log.proto | 53 |
5 files changed, 156 insertions, 106 deletions
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc index 5cd1050bcc..2945c9510f 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.cc +++ b/tensorflow/core/profiler/internal/tfprof_node.cc @@ -80,10 +80,15 @@ void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) { void ExecStep::AddMemoryStats(const string& dev, const NodeExecStats& step_stat) { - if (exec_.memory_intialized()) { + ExecMemory exec_mem; + if (step_stat.all_start_micros() > 0) { + exec_mem.set_memory_micros(step_stat.all_start_micros() + + step_stat.op_end_rel_micros()); + } else { + fprintf(stderr, "%s has no start time, skipping\n", + step_stat.node_name().c_str()); return; } - exec_.set_memory_intialized(true); int accelerator_allocator_cnt = 0; for (const auto& mem : step_stat.memory()) { @@ -93,14 +98,12 @@ void ExecStep::AddMemoryStats(const string& dev, continue; } ++accelerator_allocator_cnt; - exec_.set_allocator_bytes_in_use( - std::max(static_cast<int64>(exec_.allocator_bytes_in_use()), + exec_mem.set_allocator_bytes_in_use( + std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()), static_cast<int64>(mem.allocator_bytes_in_use()))); - Allocation allocation; for (const auto& alloc : mem.allocation_records()) { - allocation.add_allocation_records()->MergeFrom(alloc); + allocations_.push_back(alloc); } - allocations_.push_back(allocation); } if (accelerator_allocator_cnt > 1) { fprintf(stderr, "found %d gpu allocator for 1 node\n", @@ -121,22 +124,26 @@ void ExecStep::AddMemoryStats(const string& dev, uint64 output_ptr = output.tensor_description().allocation_description().ptr(); total_output_bytes += output_bytes; - output_memory_[output.slot()] = std::make_pair(output_bytes, output_ptr); + + auto& mem = (*exec_mem.mutable_output_memory())[output.slot()]; + mem.set_ptr(output_ptr); + mem.set_bytes(output_bytes); } } - exec_.set_output_bytes(total_output_bytes); + exec_mem.set_output_bytes(total_output_bytes); if (step_stat.has_memory_stats()) { - exec_.set_host_temp_bytes(exec_.host_temp_bytes() + - step_stat.memory_stats().host_temp_memory_size()); - exec_.set_host_persistent_bytes( - exec_.host_persistent_bytes() + + exec_mem.set_host_temp_bytes( + exec_mem.host_temp_bytes() + + step_stat.memory_stats().host_temp_memory_size()); + exec_mem.set_host_persistent_bytes( + exec_mem.host_persistent_bytes() + step_stat.memory_stats().host_persistent_memory_size()); - exec_.set_accelerator_temp_bytes( - exec_.accelerator_temp_bytes() + + exec_mem.set_accelerator_temp_bytes( + exec_mem.accelerator_temp_bytes() + step_stat.memory_stats().device_temp_memory_size()); - exec_.set_accelerator_persistent_bytes( - exec_.accelerator_persistent_bytes() + + exec_mem.set_accelerator_persistent_bytes( + exec_mem.accelerator_persistent_bytes() + step_stat.memory_stats().device_persistent_memory_size()); } @@ -166,18 +173,20 @@ void ExecStep::AddMemoryStats(const string& dev, requested_bytes += mem.total_bytes(); peak_bytes += mem.peak_bytes(); } - residual_bytes += - exec_.host_persistent_bytes() + exec_.accelerator_persistent_bytes(); - requested_bytes += exec_.host_persistent_bytes() + - exec_.accelerator_persistent_bytes() + - exec_.host_temp_bytes() + exec_.accelerator_temp_bytes(); - peak_bytes += exec_.host_persistent_bytes() + - exec_.accelerator_persistent_bytes() + exec_.host_temp_bytes() + - exec_.accelerator_temp_bytes(); + residual_bytes += exec_mem.host_persistent_bytes() + + exec_mem.accelerator_persistent_bytes(); + requested_bytes += exec_mem.host_persistent_bytes() + + exec_mem.accelerator_persistent_bytes() + + exec_mem.host_temp_bytes() + + exec_mem.accelerator_temp_bytes(); + peak_bytes += exec_mem.host_persistent_bytes() + + exec_mem.accelerator_persistent_bytes() + + exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes(); - exec_.set_requested_bytes(requested_bytes); - exec_.set_residual_bytes(residual_bytes); - exec_.set_peak_bytes(peak_bytes); + exec_mem.set_requested_bytes(requested_bytes); + exec_mem.set_residual_bytes(residual_bytes); + exec_mem.set_peak_bytes(peak_bytes); + memory_execs_.emplace_back(exec_mem); } void TFGraphNode::AddStepStat(int64 step, const string& device, @@ -279,5 +288,8 @@ bool IsPlacedOnAccelerator(const string& device) { return device.find("gpu") != device.npos || device.find("sycl") != device.npos; } +bool IsPlacedOnCPU(const string& device) { + return device.find("cpu") != device.npos; +} } // namespace tfprof } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h index 77c14cb792..5bc2ea3c42 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.h +++ b/tensorflow/core/profiler/internal/tfprof_node.h @@ -109,7 +109,6 @@ class ExecStep { const { return cpu_execs_; } - int64 all_start_micros() const { return exec_.all_start_micros(); } int64 latest_end_micros() const { return exec_.latest_end_micros(); } int64 lastest_schedule_end_micros() const { @@ -121,27 +120,73 @@ class ExecStep { } return ret; } - - int64 requested_bytes() const { return exec_.requested_bytes(); } - int64 peak_bytes() const { return exec_.peak_bytes(); } - int64 residual_bytes() const { return exec_.residual_bytes(); } - int64 output_bytes() const { return exec_.output_bytes(); } + int64 requested_bytes() const { + int64 requested_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + requested_bytes += exec.requested_bytes(); + } + return requested_bytes; + } + int64 peak_bytes() const { + int64 peak_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + peak_bytes += exec.peak_bytes(); + } + return peak_bytes; + } + int64 residual_bytes() const { + int64 residual_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + residual_bytes += exec.residual_bytes(); + } + return residual_bytes; + } + int64 output_bytes() const { + int64 output_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + output_bytes += exec.output_bytes(); + } + return output_bytes; + } int64 accelerator_temp_bytes() const { - return exec_.accelerator_temp_bytes(); + int64 accelerator_temp_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + accelerator_temp_bytes += exec.accelerator_temp_bytes(); + } + return accelerator_temp_bytes; + } + int64 host_temp_bytes() const { + int64 host_temp_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + host_temp_bytes += exec.host_temp_bytes(); + } + return host_temp_bytes; } - int64 host_temp_bytes() const { return exec_.host_temp_bytes(); } int64 accelerator_persistent_bytes() const { - return exec_.accelerator_persistent_bytes(); + int64 accelerator_persistent_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + accelerator_persistent_bytes += exec.accelerator_persistent_bytes(); + } + return accelerator_persistent_bytes; } - int64 host_persistent_bytes() const { return exec_.host_persistent_bytes(); } - const std::map<int32, std::pair<int64, uint64>>& output_memory() const { - return output_memory_; + int64 host_persistent_bytes() const { + int64 host_persistent_bytes = 0; + for (const ExecMemory& exec : memory_execs_) { + host_persistent_bytes += exec.host_persistent_bytes(); + } + return host_persistent_bytes; } - int64 allocator_bytes_in_use() const { - return exec_.allocator_bytes_in_use(); + std::map<int64, int64> allocator_bytes_in_use() const { + std::map<int64, int64> bytes_in_use; + for (const ExecMemory& exec : memory_execs_) { + bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use(); + } + return bytes_in_use; } - const std::vector<Allocation>& allocations() const { return allocations_; } + const std::vector<AllocationRecord>& allocations() const { + return allocations_; + } const ExecProfile& ToProto() { exec_.mutable_accelerator_execs()->clear(); @@ -169,19 +214,15 @@ class ExecStep { for (const string& d : devices_) { exec_.add_devices(d); } - - exec_.mutable_output_memory()->clear(); - for (const auto& mem : output_memory_) { - auto& mem_pb = (*exec_.mutable_output_memory())[mem.first]; - mem_pb.set_bytes(mem.second.first); - mem_pb.set_ptr(mem.second.second); - } - exec_.mutable_allocations()->Clear(); for (const auto& r : allocations_) { exec_.add_allocations()->MergeFrom(r); } + exec_.mutable_memory_execs()->Clear(); + for (const auto& m : memory_execs_) { + exec_.add_memory_execs()->MergeFrom(m); + } return exec_; } @@ -197,6 +238,7 @@ class ExecStep { op_execs_.clear(); allocations_.clear(); + memory_execs_.clear(); for (const auto& exec_time : exec_.accelerator_execs()) { auto& exec = accelerator_execs_[exec_time.first]; @@ -214,15 +256,12 @@ class ExecStep { op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1))); } } - for (const auto& output_mem : exec_.output_memory()) { - auto& mem = output_memory_[output_mem.first]; - mem.first = output_mem.second.bytes(); - mem.second = output_mem.second.ptr(); - } - for (const auto& r : exec_.allocations()) { allocations_.push_back(r); } + for (const auto& m : exec_.memory_execs()) { + memory_execs_.push_back(m); + } } private: @@ -237,14 +276,15 @@ class ExecStep { std::map<string, std::vector<std::pair<int64, int64>>> cpu_execs_; // combines accelerator_execs_ and cpu_execs_. std::map<string, std::vector<std::pair<int64, int64>>> op_execs_; + // Each ExecMemory corresponds to one scheduling of the op. Normally, + // there are multiple schedulings in while_loop. + std::vector<ExecMemory> memory_execs_; // All devices the op is associated with (e.g. gpu:0 (scheduling), // gpu:0:stream:xx (kernel exec), cpu:0 host) std::set<string> devices_; - // output_idx -> {output_bytes, memory_ptr} - std::map<int32, std::pair<int64, uint64>> output_memory_; // The history of accelerator allocations and deallocations of this step. - std::vector<Allocation> allocations_; + std::vector<AllocationRecord> allocations_; }; #define GRAPH_NODE_BYTES(type) \ @@ -598,23 +638,15 @@ class TFGraphNode { } return persistent_bytes; } - const std::map<int32, std::pair<int64, uint64>>& output_memory( - int64 step) const { + const std::map<int64, int64> allocator_bytes_in_use(int64 step) const { auto exec = execs_.find(step); if (exec == execs_.end()) { - return empty_output_memory_; - } - return exec->second.output_memory(); - } - int64 allocator_bytes_in_use(int64 step) const { - auto exec = execs_.find(step); - if (exec == execs_.end()) { - return 0; + return empty_bytes_in_use_; } return exec->second.allocator_bytes_in_use(); } - const std::vector<Allocation>& allocations(int64 step) const { + const std::vector<AllocationRecord>& allocations(int64 step) const { auto exec = execs_.find(step); if (exec == execs_.end()) { return empty_allocations_; @@ -719,9 +751,9 @@ class TFGraphNode { std::map<int64, ExecStep> execs_; // Placeholder for empty cases. - std::map<int32, std::pair<int64, uint64>> empty_output_memory_; + std::map<int64, int64> empty_bytes_in_use_; std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_; - std::vector<Allocation> empty_allocations_; + std::vector<AllocationRecord> empty_allocations_; }; class TFMultiGraphNode { @@ -874,6 +906,7 @@ class TFMultiGraphNode { std::map<string, const TFGraphNode*> nodes_; }; +bool IsPlacedOnCPU(const string& device); bool IsPlacedOnAccelerator(const string& device); bool CountAsAcceleratorTime(const string& device); bool CountAsCPUTime(const string& device); diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc index bdb000747d..b0dd8ce5e0 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline.cc +++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc @@ -153,10 +153,8 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) { std::map<int64, int64> allocs; for (const auto& alloc : node->node->allocations(step)) { - for (const auto& r : alloc.allocation_records()) { - allocs[r.alloc_micros()] += r.alloc_bytes(); - dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes(); - } + allocs[alloc.alloc_micros()] += alloc.alloc_bytes(); + dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes(); } dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes(); allocs[0] += node->node->accelerator_persistent_bytes(); @@ -167,9 +165,9 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) { last += it->second; aggregate_allocs[it->first] = last; } - int64 end_micros = node->node->lastest_schedule_end_micros(step); - if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) { - dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step); + for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) { + if (bytes_in_use.first <= 0) continue; + dev.allocations[bytes_in_use.first] = bytes_in_use.second; } } @@ -265,6 +263,10 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { } } for (const auto& dev : mem_tracker_.devices()) { + if (IsPlacedOnCPU(dev.first)) { + // TODO(xpan): Maybe also support CPU allocator memory tracking. + continue; + } int64 pid = AllocatePID(); chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid); int64 pid2 = AllocatePID(); diff --git a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc index 91eac0cf76..6a7ab01029 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc +++ b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc @@ -71,7 +71,7 @@ TEST_F(TFProfTimelineTest, GraphView) { string dump_str; TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file + "_0", &dump_str)); - EXPECT_EQ(7932146665024565912ull, Hash64(dump_str)); + EXPECT_EQ(16556121177519539380ull, Hash64(dump_str)); } TEST_F(TFProfTimelineTest, ScopeView) { diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto index b49bdf64ac..0bf1b477ed 100644 --- a/tensorflow/core/profiler/tfprof_log.proto +++ b/tensorflow/core/profiler/tfprof_log.proto @@ -90,10 +90,6 @@ message ProfileNode { map<int64, ExecProfile> execs = 12; } -message Allocation { - repeated AllocationRecord allocation_records = 1; -} - message ExecProfile { // Can be larger than 1 if run multiple times in loop. int64 run_count = 1; @@ -110,35 +106,42 @@ message ExecProfile { // For cpu, vector size can be larger than 1 if in tf.while_loop. map<string, ExecTime> cpu_execs = 5; - map<int32, Memory> output_memory = 17; + // Each entry to memory information of a scheduling of the node. + // Normally, there will be multiple entries in while_loop. + repeated ExecMemory memory_execs = 7; + // The allocation and deallocation times and sizes throughout execution. + repeated AllocationRecord allocations = 11; + // The devices related to this execution. + repeated string devices = 6; +} - repeated Allocation allocations = 18; +message ExecTime { + repeated Tuple times = 1; +} - repeated string devices = 6; +message ExecMemory { + // This is the timestamp when the memory information was tracked. + int64 memory_micros = 1; + // NOTE: Please don't depend on the following 4 fields yet. Due to + // TensorFlow internal tracing issues, the numbers can be quite wrong. + // TODO(xpan): Fix the TensorFlow internal tracing. + int64 host_temp_bytes = 2; + int64 host_persistent_bytes = 3; + int64 accelerator_temp_bytes = 4; + int64 accelerator_persistent_bytes = 5; // Total bytes requested by the op. - int64 requested_bytes = 7; + int64 requested_bytes = 6; // Total bytes requested by the op and released before op end. - int64 peak_bytes = 8; + int64 peak_bytes = 7; // Total bytes requested by the op and not released after op end. - int64 residual_bytes = 9; + int64 residual_bytes = 8; // Total bytes output by the op (not necessarily requested by the op). - int64 output_bytes = 10; - // NOTE: Please don't depend on the following 4 fields yet. Due to - // TensorFlow internal tracing issues, the numbers can be quite wrong. - // TODO(xpan): Fix the TensorFlow internal tracing. - int64 host_temp_bytes = 11; - int64 host_persistent_bytes = 12; - int64 accelerator_temp_bytes = 13; - int64 accelerator_persistent_bytes = 14; + int64 output_bytes = 9; // The total number of bytes currently allocated by the allocator if >0. - int64 allocator_bytes_in_use = 15; - - bool memory_intialized = 16; -} - -message ExecTime { - repeated Tuple times = 1; + int64 allocator_bytes_in_use = 10; + // The memory of each output of the operation. + map<int32, Memory> output_memory = 11; } message Tuple { |