diff options
Diffstat (limited to 'tensorflow/core/profiler/internal/tfprof_timeline.cc')
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_timeline.cc | 169 |
1 files changed, 92 insertions, 77 deletions
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc index 1732574cc4..bdb000747d 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline.cc +++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc @@ -25,6 +25,8 @@ limitations under the License. namespace tensorflow { namespace tfprof { namespace { +int kMaxDisplayedMemNode = 10; + string GetTimeDevName(const string& dev) { if (dev.find("stream") != dev.npos) { return strings::StrCat("Op execution threads: ", dev); @@ -85,14 +87,41 @@ void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid, events_.push_back(event); } -void ChromeTraceFormatter::EmitCounter(const string& category, - const string& name, int64 pid, int64 ts, - const string& device, int64 bytes) { - Json::Value event = CreateEvent("C", category, name, pid, 0, ts); +void ChromeTraceFormatter::EmitCounter( + const string& category, const string& name, int64 pid, int64 ts, + const string& device, int64 bytes, + const std::map<int64, std::vector<string>>& tensor_mem) { + Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts); Json::Value args(Json::objectValue); - args[device] = Json::Value(bytes); + args["Allocator Bytes in Use"] = Json::Value(bytes); event["args"] = args; events_.push_back(event); + + // TODO(xpan): chrome://tracing is not ideal visualization for memory. + // It would be great to have a customized UI for it. + Json::Value event2 = + CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts); + Json::Value args2(Json::objectValue); + // Need to reserve the same args for all locations. + for (int i = 1; i < kMaxDisplayedMemNode; ++i) { + args2[strings::Printf("Top Allocation %02d", i)] = Json::Value("N/A"); + } + int count = 0; + for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) { + for (const string& t : it->second) { + if (bytes < it->first || count >= kMaxDisplayedMemNode) { + break; + } + args2[strings::Printf("Top Allocation %02d", count)] = + Json::Value(strings::StrCat(it->first / 1000000.0, " MB from ", t)); + ++count; + bytes -= it->first; + } + } + args2[strings::StrCat("Not Displayed")] = + Json::Value(strings::Printf("%.2f MB", bytes / 1000000.0)); + event2["args"] = args2; + events_.push_back(event2); } string ChromeTraceFormatter::Format() { @@ -119,71 +148,28 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) { if (!node->Trackable(step)) { return; } + Device& dev = devices_[node->node->canonical_device()]; - int64 end_micros = node->node->latest_end_micros(step); - if (node->node->accelerator_persistent_bytes(step) != 0) { - string tensor_name = strings::StrCat(node->name(), ":", -1); - dev.earliest_ref[tensor_name] = node->node->all_start_micros(step); - dev.tensor_size[tensor_name] = - node->node->accelerator_persistent_bytes(step); - // TODO(xpan): Need latest_ref? - } - if (node->node->accelerator_temp_bytes(step)) { - string tensor_name = strings::StrCat(node->name(), ":", -2); - dev.earliest_ref[tensor_name] = node->node->all_start_micros(step); - dev.latest_ref[tensor_name] = end_micros; - dev.tensor_size[tensor_name] = node->node->accelerator_temp_bytes(step); - } - if (node->node->allocator_bytes_in_use(step) > 0) { - dev.allocator_stats[end_micros] = node->node->allocator_bytes_in_use(step); - } -} -void MemoryTracker::TrackNodeConnection(int64 step, const GraphNode* node, - const GraphNode* src) { - if (!node->Trackable(step) || !src->Trackable(step)) { - return; - } - const auto& output_idx = node->node->src_output_idx().find(src->name()); - if (output_idx == node->node->src_output_idx().end()) { - return; - } - const auto& output = src->node->output_memory(step).find(output_idx->second); - if (output == src->node->output_memory(step).end()) { - return; + std::map<int64, int64> allocs; + for (const auto& alloc : node->node->allocations(step)) { + for (const auto& r : alloc.allocation_records()) { + allocs[r.alloc_micros()] += r.alloc_bytes(); + dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes(); + } } - int64 output_bytes = output->second.first; - uint64 output_ptr = output->second.second; - - Device& src_dev = devices_[src->node->canonical_device()]; - string tensor_name = strings::StrCat(output_ptr); - if (output_ptr == 0) { - fprintf(stderr, "output no ptr\n"); - tensor_name = strings::StrCat(src->node->name(), ":", output_idx->second); + dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes(); + allocs[0] += node->node->accelerator_persistent_bytes(); + + int64 last = 0; + std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()]; + for (auto it = allocs.begin(); it != allocs.end(); ++it) { + last += it->second; + aggregate_allocs[it->first] = last; } - - src_dev.tensor_size[tensor_name] = output_bytes; - src_dev.earliest_ref[tensor_name] = src->node->all_start_micros(step); - - int64 src_end_micros = src->node->latest_end_micros(step); - - if (src->node->canonical_device() != node->node->canonical_device()) { - int64 transfer_micros = - (src_end_micros + node->node->all_start_micros(step)) / 2; - src_dev.latest_ref[tensor_name] = - std::max(src_dev.latest_ref[tensor_name], transfer_micros); - - Device& dest_dev = devices_[node->node->canonical_device()]; - string dest_tensor_name = - strings::StrCat(tensor_name, node->node->canonical_device()); - dest_dev.tensor_size[dest_tensor_name] = output_bytes; - dest_dev.earliest_ref[dest_tensor_name] = transfer_micros; - dest_dev.latest_ref[dest_tensor_name] = - std::max(dest_dev.latest_ref[dest_tensor_name], - node->node->latest_end_micros(step)); - } else { - src_dev.latest_ref[tensor_name] = std::max( - src_dev.latest_ref[tensor_name], node->node->latest_end_micros(step)); + int64 end_micros = node->node->lastest_schedule_end_micros(step); + if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) { + dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step); } } @@ -222,22 +208,24 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { for (GraphNode* gnode : gnodes) { AllocateTimeNodes(gnode); } + // To save memory, we only track cross-device (canonical device) flows. for (auto& process : tnodes_) { + if (!IsCanonicalDevice(process.first)) continue; for (auto& tn : process.second) { TimeNode* tnode = tn.second.get(); for (GraphNode* inp : tnode->node->children) { if (!inp->account || !inp->Trackable(step_)) { continue; } - TrackNodeConnection(tnode->node, inp); - for (const auto& kernel_execs : inp->node->op_execs(step_)) { - if (process.first == kernel_execs.first) { - // Not interested in flow withthin the same device. + for (const auto& execs : inp->node->cpu_execs(step_)) { + if (!IsCanonicalDevice(execs.first)) continue; + if (process.first == execs.first) { + // Not interested in flow within the same device. continue; } - for (const auto& exec : kernel_execs.second) { + for (const auto& exec : execs.second) { int64 start_micros = exec.first; - auto cprocess = tnodes_.find(kernel_execs.first); + auto cprocess = tnodes_.find(execs.first); if (cprocess == tnodes_.end()) continue; auto ctn = cprocess->second.find(start_micros); if (ctn == cprocess->second.end()) continue; @@ -258,7 +246,6 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { Json::Value args(Json::objectValue); args["name"] = Json::Value(tnode->name()); - args["op"] = Json::Value(tnode->name()); chrome_formatter_.EmitRegion(node.first, tnode->exec_micros, process.first, lane.first, "Op", tnode->name(), args); @@ -280,12 +267,40 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { for (const auto& dev : mem_tracker_.devices()) { int64 pid = AllocatePID(); chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid); + int64 pid2 = AllocatePID(); + chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations", + pid2); + const MemoryTracker::Device& device = dev.second; - for (const auto& alloc_stats : device.allocator_stats) { - chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, - alloc_stats.first, dev.first, - alloc_stats.second); + int64 max_bytes_in_use = 0; + int64 cur_bytes_in_use = 0; + int64 last_point = 0; + for (const auto& alloc : device.allocations) { + cur_bytes_in_use = alloc.second; + max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use); + // Do not plot too dense to reduce file size. + int64 ts = alloc.first; + if (ts - last_point < 100) continue; + last_point = ts; + + std::map<int64, std::vector<string>> tensor_mem; + for (const auto& tensor_alloc_it : dev.second.tensor_allocs) { + const auto& tensor_alloc = tensor_alloc_it.second; + auto it = tensor_alloc.lower_bound(ts); + if (it != tensor_alloc.begin()) { + --it; + } + if (it->second > 0) { + tensor_mem[it->second].push_back(tensor_alloc_it.first); + } + } + chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts, + dev.first, cur_bytes_in_use, tensor_mem); + } + if (IsPlacedOnAccelerator(dev.first)) { + fprintf(stdout, "%s peak memory: %.2f MB\n", dev.first.c_str(), + max_bytes_in_use / 1000000.0); } } OutputTimeline(); |