Diffstat (limited to 'tensorflow/core/profiler/internal/tfprof_timeline.cc')
-rw-r--r--  tensorflow/core/profiler/internal/tfprof_timeline.cc | 169
1 file changed, 92 insertions(+), 77 deletions(-)
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc
index 1732574cc4..bdb000747d 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc
@@ -25,6 +25,8 @@ limitations under the License.
namespace tensorflow {
namespace tfprof {
namespace {
+constexpr int kMaxDisplayedMemNode = 10;
+
string GetTimeDevName(const string& dev) {
if (dev.find("stream") != dev.npos) {
return strings::StrCat("Op execution threads: ", dev);
@@ -85,14 +87,41 @@ void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid,
events_.push_back(event);
}
-void ChromeTraceFormatter::EmitCounter(const string& category,
- const string& name, int64 pid, int64 ts,
- const string& device, int64 bytes) {
- Json::Value event = CreateEvent("C", category, name, pid, 0, ts);
+void ChromeTraceFormatter::EmitCounter(
+ const string& category, const string& name, int64 pid, int64 ts,
+ const string& device, int64 bytes,
+ const std::map<int64, std::vector<string>>& tensor_mem) {
+ Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
Json::Value args(Json::objectValue);
- args[device] = Json::Value(bytes);
+ args["Allocator Bytes in Use"] = Json::Value(bytes);
event["args"] = args;
events_.push_back(event);
+
+ // TODO(xpan): chrome://tracing is not ideal visualization for memory.
+ // It would be great to have a customized UI for it.
+ Json::Value event2 =
+ CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts);
+ Json::Value args2(Json::objectValue);
+  // Chrome tracing expects every counter event in a lane to carry the same
+  // set of arg keys, so reserve all slots up front with an "N/A" placeholder.
+  for (int i = 0; i < kMaxDisplayedMemNode; ++i) {
+ args2[strings::Printf("Top Allocation %02d", i)] = Json::Value("N/A");
+ }
+ int count = 0;
+ for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
+ for (const string& t : it->second) {
+ if (bytes < it->first || count >= kMaxDisplayedMemNode) {
+ break;
+ }
+ args2[strings::Printf("Top Allocation %02d", count)] =
+ Json::Value(strings::StrCat(it->first / 1000000.0, " MB from ", t));
+ ++count;
+ bytes -= it->first;
+ }
+ }
+  args2["Not Displayed"] =
+      Json::Value(strings::Printf("%.2f MB", bytes / 1000000.0));
+ event2["args"] = args2;
+ events_.push_back(event2);
}
string ChromeTraceFormatter::Format() {
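A note on the breakdown logic added above: the loop walks tensor_mem from
largest to smallest allocation and greedily attributes the allocator's
bytes-in-use total to at most kMaxDisplayedMemNode named tensors, lumping
whatever remains into the "Not Displayed" bucket. Below is a minimal
standalone re-creation of that loop; the tensor names and byte counts are
invented for illustration.

    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    constexpr int kMax = 10;  // stands in for kMaxDisplayedMemNode

    int main() {
      long long bytes_in_use = 5000000;  // allocator total at this timestamp
      // allocation size in bytes -> names of tensors holding a buffer that size
      std::map<long long, std::vector<std::string>> tensor_mem = {
          {3000000, {"conv1/weights"}},
          {1500000, {"fc/bias", "fc/weights"}},
      };
      int count = 0;
      // Largest sizes first; a size that no longer fits the remaining total is
      // skipped, but smaller sizes later in the walk may still fit.
      for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
        for (const std::string& t : it->second) {
          if (bytes_in_use < it->first || count >= kMax) break;
          std::printf("Top Allocation %02d: %.2f MB from %s\n", count,
                      it->first / 1e6, t.c_str());
          ++count;
          bytes_in_use -= it->first;
        }
      }
      std::printf("Not Displayed: %.2f MB\n", bytes_in_use / 1e6);
      return 0;
    }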
@@ -119,71 +148,28 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
if (!node->Trackable(step)) {
return;
}
+
Device& dev = devices_[node->node->canonical_device()];
- int64 end_micros = node->node->latest_end_micros(step);
- if (node->node->accelerator_persistent_bytes(step) != 0) {
- string tensor_name = strings::StrCat(node->name(), ":", -1);
- dev.earliest_ref[tensor_name] = node->node->all_start_micros(step);
- dev.tensor_size[tensor_name] =
- node->node->accelerator_persistent_bytes(step);
- // TODO(xpan): Need latest_ref?
- }
- if (node->node->accelerator_temp_bytes(step)) {
- string tensor_name = strings::StrCat(node->name(), ":", -2);
- dev.earliest_ref[tensor_name] = node->node->all_start_micros(step);
- dev.latest_ref[tensor_name] = end_micros;
- dev.tensor_size[tensor_name] = node->node->accelerator_temp_bytes(step);
- }
- if (node->node->allocator_bytes_in_use(step) > 0) {
- dev.allocator_stats[end_micros] = node->node->allocator_bytes_in_use(step);
- }
-}
-void MemoryTracker::TrackNodeConnection(int64 step, const GraphNode* node,
- const GraphNode* src) {
- if (!node->Trackable(step) || !src->Trackable(step)) {
- return;
- }
- const auto& output_idx = node->node->src_output_idx().find(src->name());
- if (output_idx == node->node->src_output_idx().end()) {
- return;
- }
- const auto& output = src->node->output_memory(step).find(output_idx->second);
- if (output == src->node->output_memory(step).end()) {
- return;
+ std::map<int64, int64> allocs;
+ for (const auto& alloc : node->node->allocations(step)) {
+ for (const auto& r : alloc.allocation_records()) {
+ allocs[r.alloc_micros()] += r.alloc_bytes();
+ dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes();
+ }
}
- int64 output_bytes = output->second.first;
- uint64 output_ptr = output->second.second;
-
- Device& src_dev = devices_[src->node->canonical_device()];
- string tensor_name = strings::StrCat(output_ptr);
- if (output_ptr == 0) {
- fprintf(stderr, "output no ptr\n");
- tensor_name = strings::StrCat(src->node->name(), ":", output_idx->second);
+ dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
+ allocs[0] += node->node->accelerator_persistent_bytes();
+
+ int64 last = 0;
+ std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()];
+ for (auto it = allocs.begin(); it != allocs.end(); ++it) {
+ last += it->second;
+ aggregate_allocs[it->first] = last;
}
-
- src_dev.tensor_size[tensor_name] = output_bytes;
- src_dev.earliest_ref[tensor_name] = src->node->all_start_micros(step);
-
- int64 src_end_micros = src->node->latest_end_micros(step);
-
- if (src->node->canonical_device() != node->node->canonical_device()) {
- int64 transfer_micros =
- (src_end_micros + node->node->all_start_micros(step)) / 2;
- src_dev.latest_ref[tensor_name] =
- std::max(src_dev.latest_ref[tensor_name], transfer_micros);
-
- Device& dest_dev = devices_[node->node->canonical_device()];
- string dest_tensor_name =
- strings::StrCat(tensor_name, node->node->canonical_device());
- dest_dev.tensor_size[dest_tensor_name] = output_bytes;
- dest_dev.earliest_ref[dest_tensor_name] = transfer_micros;
- dest_dev.latest_ref[dest_tensor_name] =
- std::max(dest_dev.latest_ref[dest_tensor_name],
- node->node->latest_end_micros(step));
- } else {
- src_dev.latest_ref[tensor_name] = std::max(
- src_dev.latest_ref[tensor_name], node->node->latest_end_micros(step));
+ int64 end_micros = node->node->lastest_schedule_end_micros(step);
+ if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) {
+ dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step);
}
}
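The rewritten TrackNode drops the old per-tensor earliest_ref/latest_ref
bookkeeping. Instead it buckets each node's allocation records by alloc_micros
(in TF's step stats, a deallocation is a record with negative alloc_bytes) and
turns the deltas into a running total, so dev.tensor_allocs[node->name()]
becomes a cumulative "bytes live at time t" curve. A self-contained sketch of
that conversion with invented sample data:

    #include <cstdint>
    #include <cstdio>
    #include <map>

    int main() {
      // Deltas keyed by microsecond timestamp; negative means "freed".
      std::map<int64_t, int64_t> allocs = {
          {100, 4096},   // +4 KB allocated at t=100us
          {250, 8192},   // +8 KB allocated at t=250us
          {900, -4096},  // the first buffer freed at t=900us
      };
      // Prefix-sum the deltas into a cumulative live-bytes curve.
      std::map<int64_t, int64_t> aggregate_allocs;
      int64_t last = 0;
      for (const auto& kv : allocs) {
        last += kv.second;
        aggregate_allocs[kv.first] = last;
      }
      for (const auto& kv : aggregate_allocs)
        std::printf("t=%lldus live=%lld bytes\n", (long long)kv.first,
                    (long long)kv.second);
      return 0;
    }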
@@ -222,22 +208,24 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
for (GraphNode* gnode : gnodes) {
AllocateTimeNodes(gnode);
}
+ // To save memory, we only track cross-device (canonical device) flows.
for (auto& process : tnodes_) {
+ if (!IsCanonicalDevice(process.first)) continue;
for (auto& tn : process.second) {
TimeNode* tnode = tn.second.get();
for (GraphNode* inp : tnode->node->children) {
if (!inp->account || !inp->Trackable(step_)) {
continue;
}
- TrackNodeConnection(tnode->node, inp);
- for (const auto& kernel_execs : inp->node->op_execs(step_)) {
- if (process.first == kernel_execs.first) {
- // Not interested in flow withthin the same device.
+ for (const auto& execs : inp->node->cpu_execs(step_)) {
+ if (!IsCanonicalDevice(execs.first)) continue;
+ if (process.first == execs.first) {
+ // Not interested in flow within the same device.
continue;
}
- for (const auto& exec : kernel_execs.second) {
+ for (const auto& exec : execs.second) {
int64 start_micros = exec.first;
- auto cprocess = tnodes_.find(kernel_execs.first);
+ auto cprocess = tnodes_.find(execs.first);
if (cprocess == tnodes_.end()) continue;
auto ctn = cprocess->second.find(start_micros);
if (ctn == cprocess->second.end()) continue;
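The loop above only pairs kernels across canonical devices; the arrows
chrome://tracing draws between lanes come from flow events, an "s" (flow
start) record on the producer's lane and a "t" (flow end) record on the
consumer's lane that share the same id. The bodies of EmitFlowStart and
EmitFlowEnd are not part of this hunk, so the JSON below is a hand-written
illustration of the chrome://tracing flow-event shape rather than the
formatter's verbatim output:

    #include <cstdio>

    int main() {
      // Two events tied together by id=7: the producer finishes on pid 1 at
      // ts=1000, the consumer starts on pid 2 at ts=1500, and the viewer
      // renders an arrow between them.
      std::printf(
          "{\"ph\":\"s\",\"cat\":\"DataFlow\",\"name\":\"flow\","
          "\"pid\":1,\"tid\":0,\"ts\":1000,\"id\":7}\n"
          "{\"ph\":\"t\",\"cat\":\"DataFlow\",\"name\":\"flow\","
          "\"pid\":2,\"tid\":0,\"ts\":1500,\"id\":7}\n");
      return 0;
    }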
@@ -258,7 +246,6 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
Json::Value args(Json::objectValue);
args["name"] = Json::Value(tnode->name());
- args["op"] = Json::Value(tnode->name());
chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
process.first, lane.first, "Op",
tnode->name(), args);
@@ -280,12 +267,40 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
for (const auto& dev : mem_tracker_.devices()) {
int64 pid = AllocatePID();
chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
+ int64 pid2 = AllocatePID();
+ chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations",
+ pid2);
+
const MemoryTracker::Device& device = dev.second;
- for (const auto& alloc_stats : device.allocator_stats) {
- chrome_formatter_.EmitCounter("Memory", "Memory Series", pid,
- alloc_stats.first, dev.first,
- alloc_stats.second);
+ int64 max_bytes_in_use = 0;
+ int64 cur_bytes_in_use = 0;
+ int64 last_point = 0;
+ for (const auto& alloc : device.allocations) {
+ cur_bytes_in_use = alloc.second;
+ max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use);
+      // Skip samples closer than 100us apart to keep the trace file small.
+ int64 ts = alloc.first;
+ if (ts - last_point < 100) continue;
+ last_point = ts;
+
+ std::map<int64, std::vector<string>> tensor_mem;
+ for (const auto& tensor_alloc_it : dev.second.tensor_allocs) {
+ const auto& tensor_alloc = tensor_alloc_it.second;
+ auto it = tensor_alloc.lower_bound(ts);
+ if (it != tensor_alloc.begin()) {
+ --it;
+ }
+ if (it->second > 0) {
+ tensor_mem[it->second].push_back(tensor_alloc_it.first);
+ }
+ }
+ chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts,
+ dev.first, cur_bytes_in_use, tensor_mem);
+ }
+ if (IsPlacedOnAccelerator(dev.first)) {
+ fprintf(stdout, "%s peak memory: %.2f MB\n", dev.first.c_str(),
+ max_bytes_in_use / 1000000.0);
}
}
OutputTimeline();
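Building tensor_mem in the loop above is a step-function query: for each
tensor's cumulative curve, look up the value in effect at sample time ts. A
minimal sketch of the usual std::map idiom; note it uses upper_bound, whereas
the diff's lower_bound-then-decrement skips an update that lands exactly on ts:

    #include <cstdint>
    #include <cstdio>
    #include <map>

    // Returns the last recorded value at or before ts, or 0 if there is none.
    int64_t BytesLiveAt(const std::map<int64_t, int64_t>& curve, int64_t ts) {
      auto it = curve.upper_bound(ts);  // first entry strictly after ts
      if (it == curve.begin()) return 0;
      --it;                             // last entry at or before ts
      return it->second;
    }

    int main() {
      std::map<int64_t, int64_t> curve = {{0, 0}, {100, 4096}, {250, 12288}};
      std::printf("live at t=120us: %lld bytes\n",
                  (long long)BytesLiveAt(curve, 120));  // prints 4096
      return 0;
    }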