author     2017-10-30 20:08:55 -0700
committer  2017-10-30 20:13:00 -0700
commit     113be57466d36ab7086794475cf4579f3e6b940b (patch)
tree       2c77d2a78afe04d79386f5afbfa20a28513f4a68 /tensorflow/core/profiler
parent     6d1263cdf8ee8323513f984553dbeb070865fd0c (diff)
A few profiler improvements
1. Track the full allocation history of each tensor and visualize it in the timeline (see the sketch after this message).
2. Better ProfileContext for tracing step selection; AddStep() now reports the run coverage (see the sketch after the diffstat).
3. Small bug fix.
PiperOrigin-RevId: 173988293
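The first item can be made concrete with a small sketch. The struct and numbers below are hypothetical stand-ins (the real code reads AllocationRecord protos out of each node's ExecStep inside MemoryTracker::TrackNode); only the folding of timestamped byte deltas into a cumulative bytes-in-use curve mirrors the change in tfprof_timeline.cc.

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

// Stand-in for the AllocationRecord proto: a timestamped byte delta,
// positive for an allocation, negative for a deallocation.
struct AllocationRecord {
  int64_t alloc_micros;
  int64_t alloc_bytes;
};

int main() {
  // One node's records, as they would come out of RunMetadata (made up).
  std::vector<AllocationRecord> records = {
      {1000, 4096}, {1200, 8192}, {2000, -4096}, {2600, -8192}};

  // Same folding as the new MemoryTracker::TrackNode: sum the deltas per
  // timestamp, then turn them into a running bytes-in-use total that the
  // timeline can sample at any point in time.
  std::map<int64_t, int64_t> deltas;
  for (const auto& r : records) deltas[r.alloc_micros] += r.alloc_bytes;

  std::map<int64_t, int64_t> bytes_in_use;  // time (us) -> cumulative bytes
  int64_t running = 0;
  for (const auto& d : deltas) {
    running += d.second;
    bytes_in_use[d.first] = running;
  }

  for (const auto& p : bytes_in_use)
    std::printf("t=%lld us  in_use=%lld bytes\n",
                static_cast<long long>(p.first),
                static_cast<long long>(p.second));
  return 0;
}
```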
Diffstat (limited to 'tensorflow/core/profiler')
14 files changed, 273 insertions, 146 deletions
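The step-selection improvement is built on a new coverage metric: TFStats now records which graph nodes have RunMetadata attached (covered_nodes_), and AddStep() returns the covered fraction so a caller such as ProfileContext can decide whether more steps are worth tracing. The sketch below is a simplified stand-in (CoverageTracker is not a real profiler class); only the run_coverage() arithmetic is taken from tfprof_stats.h.

```cpp
#include <cstdio>
#include <map>
#include <set>
#include <string>

// Hypothetical stand-in for the profiler's node bookkeeping.
struct CoverageTracker {
  std::map<std::string, int> nodes_map;  // all graph nodes, name -> id
  std::set<int> covered_nodes;           // ids seen in RunMetadata so far

  // Mirrors TFStats::run_coverage(): covered / total, with a small epsilon
  // so an empty graph does not divide by zero.
  double run_coverage() const {
    return covered_nodes.size() / (nodes_map.size() + 1e-10);
  }
};

int main() {
  CoverageTracker t;
  t.nodes_map = {{"MatMul", 0}, {"BiasAdd", 1}, {"Relu", 2}, {"Softmax", 3}};
  t.covered_nodes = {0, 1, 2};  // only three of four nodes were traced
  // profiler.cc now prints the same figure after each AddRunMeta() call.
  std::printf("run graph coverage: %.2f\n", t.run_coverage());  // 0.75
  return 0;
}
```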
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc index c39d44b7fa..d05143aff9 100644 --- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc +++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc @@ -48,7 +48,7 @@ class TFProfAdvisorTest : public ::testing::Test { for (const auto& attr : attrs) { (*def->mutable_attr())[attr.first].set_s(attr.second); } - std::unique_ptr<TFGraphNode> node(new TFGraphNode(def, -1)); + std::unique_ptr<TFGraphNode> node(new TFGraphNode(def, -1, nullptr)); NodeExecStats node_stat; node_stat.set_all_start_micros(start_miros); diff --git a/tensorflow/core/profiler/internal/print_model_analysis.cc b/tensorflow/core/profiler/internal/print_model_analysis.cc index 575ae182ee..7a0d590262 100644 --- a/tensorflow/core/profiler/internal/print_model_analysis.cc +++ b/tensorflow/core/profiler/internal/print_model_analysis.cc @@ -119,8 +119,8 @@ void DeleteProfiler() { } } -void AddStep(int64 step, const string* graph, const string* run_meta, - const string* op_log) { +double AddStep(int64 step, const string* graph, const string* run_meta, + const string* op_log) { CHECK(tf_stat); CHECK(graph && !graph->empty()); @@ -144,6 +144,7 @@ void AddStep(int64 step, const string* graph, const string* run_meta, op_log_ptr->ParseFromString(*op_log); tf_stat->AddOpLogProto(std::move(op_log_ptr)); } + return tf_stat->run_coverage(); } string Profile(const string* command, const string* options) { @@ -154,6 +155,7 @@ string Profile(const string* command, const string* options) { } void WriteProfile(const string* filename) { + CHECK(tf_stat); CHECK(filename) << "empty file name when asking to write profile."; tf_stat->WriteProfile(*filename); } diff --git a/tensorflow/core/profiler/internal/print_model_analysis.h b/tensorflow/core/profiler/internal/print_model_analysis.h index e4d01041a8..31ff5b07b0 100644 --- a/tensorflow/core/profiler/internal/print_model_analysis.h +++ b/tensorflow/core/profiler/internal/print_model_analysis.h @@ -35,8 +35,8 @@ bool NewProfiler(const string* graph, const string* op_log); void DeleteProfiler(); -void AddStep(int64 step, const string* graph, const string* run_meta, - const string* op_log); +double AddStep(int64 step, const string* graph, const string* run_meta, + const string* op_log); // Write the profiler's profile to a proto buffer. void WriteProfile(const string* filename); diff --git a/tensorflow/core/profiler/internal/tfprof_graph.cc b/tensorflow/core/profiler/internal/tfprof_graph.cc index 3766365bf8..db7ae3b397 100644 --- a/tensorflow/core/profiler/internal/tfprof_graph.cc +++ b/tensorflow/core/profiler/internal/tfprof_graph.cc @@ -31,7 +31,7 @@ GraphNode* TFGraph::CreateParentNode(const string& name) { node_defs_.back()->set_name(name); node_defs_.back()->set_op(kTFGraphParent); parent_nodes_[name] = std::unique_ptr<TFGraphNode>( - new TFGraphNode(node_defs_.back().get(), -1)); + new TFGraphNode(node_defs_.back().get(), -1, nullptr)); nodes_map_[name] = std::unique_ptr<GraphNode>(new GraphNode(parent_nodes_[name].get())); return nodes_map_[name].get(); diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc index f283fafc0f..671b65d708 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.cc +++ b/tensorflow/core/profiler/internal/tfprof_node.cc @@ -19,19 +19,15 @@ limitations under the License. 
namespace tensorflow { namespace tfprof { -namespace { bool CountAsAcceleratorTime(const string& device) { return device.find("stream:all") != device.npos; } - bool CountAsCPUTime(const string& device) { return RE2::FullMatch(device, ".*/(device:gpu|gpu|device:cpu|cpu|device:sycl):\\d+"); } - bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); } -} // namespace // Notes about start and end time from the NodeExecStats proto: // For GPU, there is no difference between op_end_rel_micros and // all_end_rel_micros. All are kernel times. @@ -89,16 +85,28 @@ void ExecStep::AddMemoryStats(const string& dev, } exec_.set_memory_intialized(true); + int accelerator_allocator_cnt = 0; for (const auto& mem : step_stat.memory()) { // TODO(xpan): Fix this hack. Currently the allocator name seems quite // ad-hoc. if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) { continue; } + ++accelerator_allocator_cnt; exec_.set_allocator_bytes_in_use( std::max(static_cast<int64>(exec_.allocator_bytes_in_use()), static_cast<int64>(mem.allocator_bytes_in_use()))); + Allocation allocation; + for (const auto& alloc : mem.allocation_records()) { + allocation.add_allocation_records()->MergeFrom(alloc); + } + allocations_.push_back(allocation); } + if (accelerator_allocator_cnt > 1) { + fprintf(stderr, "found %d gpu allocator for 1 node\n", + accelerator_allocator_cnt); + } + int64 total_output_bytes = 0; for (const auto& output : step_stat.output()) { if (output.has_tensor_description() && diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h index 34bc0a581d..e2d0563a07 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.h +++ b/tensorflow/core/profiler/internal/tfprof_node.h @@ -105,8 +105,22 @@ class ExecStep { const { return op_execs_; } + const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs() + const { + return cpu_execs_; + } + int64 all_start_micros() const { return exec_.all_start_micros(); } int64 latest_end_micros() const { return exec_.latest_end_micros(); } + int64 lastest_schedule_end_micros() const { + int64 ret = 0; + for (const auto& exec : cpu_execs_) { + for (const auto& pair : exec.second) { + ret = std::max(ret, pair.first + pair.second); + } + } + return ret; + } int64 requested_bytes() const { return exec_.requested_bytes(); } int64 peak_bytes() const { return exec_.peak_bytes(); } @@ -127,6 +141,8 @@ class ExecStep { return exec_.allocator_bytes_in_use(); } + const std::vector<Allocation>& allocations() const { return allocations_; } + const ExecProfile& ToProto() { exec_.mutable_accelerator_execs()->clear(); for (const auto& e : accelerator_execs_) { @@ -161,6 +177,11 @@ class ExecStep { mem_pb.set_ptr(mem.second.second); } + exec_.mutable_allocations()->Clear(); + for (const auto& r : allocations_) { + exec_.add_allocations()->MergeFrom(r); + } + return exec_; } @@ -175,6 +196,8 @@ class ExecStep { cpu_execs_.clear(); op_execs_.clear(); + allocations_.clear(); + for (const auto& exec_time : exec_.accelerator_execs()) { auto& exec = accelerator_execs_[exec_time.first]; auto& op_exec = op_execs_[exec_time.first]; @@ -196,6 +219,10 @@ class ExecStep { mem.first = output_mem.second.bytes(); mem.second = output_mem.second.ptr(); } + + for (const auto& r : exec_.allocations()) { + allocations_.push_back(r); + } } private: @@ -215,6 +242,9 @@ class ExecStep { std::set<string> devices_; // output_idx -> {output_bytes, memory_ptr} std::map<int32, std::pair<int64, uint64>> 
output_memory_; + + // The history of accelerator allocations and deallocations of this step. + std::vector<Allocation> allocations_; }; #define GRAPH_NODE_BYTES(type) \ @@ -238,11 +268,15 @@ class ExecStep { class TFGraphNode { public: TFGraphNode(const ProfileNode& node, const ProfileProto& profile, - const std::map<int64, string>* id_to_string) { + const std::map<int64, string>* id_to_string, + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) { + nodes_map_ = nodes_map; FromProto(node, profile, id_to_string); } - TFGraphNode(const NodeDef* node, int64 id) { + TFGraphNode(const NodeDef* node, int64 id, + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) { + nodes_map_ = nodes_map; node_.set_id(id); node_.set_name(node->name()); node_.set_op(node->op()); @@ -269,17 +303,9 @@ class TFGraphNode { op_types_.insert(node->op()); } - void AddInput(TFGraphNode* input, int32 output_idx, int input_idx) { - src_output_idx_[input->name()] = output_idx; - - inputs_[input_idx] = input->name(); - const auto& output_shape = input->output_shapes().find(output_idx); - // Always create an empty vec even if the shape info might be missing. - std::vector<int64>& shape_vec = input_shapes_[input_idx]; - if (output_shape != input->output_shapes().end()) { - shape_vec.assign(output_shape->second.begin(), - output_shape->second.end()); - } + void AddInput(const string& input, int64 output_index, int input_idx) { + inputs_[input_idx] = input; + src_output_idx_[input] = output_index; } void AddOpType(const string& op_type) { op_types_.insert(op_type); } @@ -416,9 +442,6 @@ class TFGraphNode { } const std::map<int32, string>& inputs() const { return inputs_; } - const std::map<string, int32>& src_output_idx() const { - return src_output_idx_; - } // Number of times the graph node is executed. When step < 0, the // average number of times executed across all steps. 
@@ -526,14 +549,30 @@ class TFGraphNode { return exec->second.latest_end_micros(); } + int64 lastest_schedule_end_micros(int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.lastest_schedule_end_micros(); + } + const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs( int64 step) const { auto exec = execs_.find(step); if (exec == execs_.end()) { - return empty_op_execs_; + return empty_execs_; } return exec->second.op_execs(); } + const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs( + int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_execs_; + } + return exec->second.cpu_execs(); + } const std::map<int64, ExecStep>& all_op_execs() const { return execs_; } @@ -551,12 +590,12 @@ class TFGraphNode { } return exec->second.host_temp_bytes(); } - int64 accelerator_persistent_bytes(int64 step) const { - auto exec = execs_.find(step); - if (exec == execs_.end()) { - return 0; + int64 accelerator_persistent_bytes() const { + int64 persistent_bytes = 0; + for (const auto& exec : execs_) { + persistent_bytes += exec.second.accelerator_persistent_bytes(); } - return exec->second.accelerator_persistent_bytes(); + return persistent_bytes; } int64 host_persistent_bytes(int64 step) const { auto exec = execs_.find(step); @@ -581,6 +620,14 @@ class TFGraphNode { return exec->second.allocator_bytes_in_use(); } + const std::vector<Allocation>& allocations(int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_allocations_; + } + return exec->second.allocations(); + } + int64 parameters() const { if (!shape().empty()) { int64 params = 1; @@ -628,18 +675,44 @@ class TFGraphNode { const std::map<int, std::vector<int64>>& output_shapes() const { return output_shapes_; } - const std::map<int, std::vector<int64>>& input_shapes() const { - return input_shapes_; + + const std::map<int, std::vector<int64>> input_shapes() const { + std::map<int, std::vector<int64>> input_shapes; + for (const auto& inp : inputs_) { + // Always create an empty vec even if the shape info might be missing. + std::vector<int64>& shape_vec = input_shapes[inp.first]; + if (!nodes_map_) continue; + auto input_it = nodes_map_->find(inp.second); + if (input_it == nodes_map_->end()) continue; + auto output_it = src_output_idx_.find(inp.second); + if (output_it == src_output_idx_.end()) continue; + + const TFGraphNode* input_node = input_it->second.get(); + if (!input_node) continue; + const auto& output_shapes = input_node->output_shapes(); + const auto& output_shape = output_shapes.find(output_it->second); + if (output_shape == output_shapes.end()) continue; + + if (output_shape != input_node->output_shapes().end()) { + shape_vec.assign(output_shape->second.begin(), + output_shape->second.end()); + } + } + return input_shapes; } private: + // maps graph node name to TFGraphNode. Not owned. + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map_; + // inputs to the node. input index -> input node name. std::map<int, string> inputs_; + // The output index of the source node. std::map<string, int32> src_output_idx_; - + // proto for serialize/deserialized representation of the node. ProfileNode node_; - + // Python call stack that creates the name. std::unique_ptr<CallStack> call_stack_; - + // Shape of the node (e.g. Variable) if available. std::vector<int64> shape_; // Won't missing input_idx. But some shapes might be empty (unknown). 
std::map<int, std::vector<int64>> input_shapes_; @@ -651,8 +724,10 @@ class TFGraphNode { std::map<int64, ExecStep> execs_; + // Placeholder for empty cases. std::map<int32, std::pair<int64, uint64>> empty_output_memory_; - std::map<string, std::vector<std::pair<int64, int64>>> empty_op_execs_; + std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_; + std::vector<Allocation> empty_allocations_; }; class TFMultiGraphNode { @@ -806,6 +881,10 @@ class TFMultiGraphNode { }; bool IsPlacedOnAccelerator(const string& device); +bool CountAsAcceleratorTime(const string& device); +bool CountAsCPUTime(const string& device); +bool IsCanonicalDevice(const string& device); + } // namespace tfprof } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/tfprof_scope.cc b/tensorflow/core/profiler/internal/tfprof_scope.cc index 128b296d5c..988bed71cc 100644 --- a/tensorflow/core/profiler/internal/tfprof_scope.cc +++ b/tensorflow/core/profiler/internal/tfprof_scope.cc @@ -35,7 +35,7 @@ ScopeNode* TFScope::CreateParentNode(const string& name) { node_defs_.back()->set_name(name); node_defs_.back()->set_op(kTFScopeParent); parent_nodes_[name] = std::unique_ptr<TFGraphNode>( - new TFGraphNode(node_defs_.back().get(), -1)); + new TFGraphNode(node_defs_.back().get(), -1, nullptr)); nodes_map_[name] = std::unique_ptr<ScopeNode>(new ScopeNode(parent_nodes_[name].get())); return nodes_map_[name].get(); diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc index b4b98141f3..7943c075e0 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats.cc +++ b/tensorflow/core/profiler/internal/tfprof_stats.cc @@ -36,7 +36,9 @@ bool CreateRunMetadataNode(const string& name, NodeDef* def) { } def->set_name(name); // TODO(xpan): Better operation type. - def->set_op("RunTimeOp"); + // This is because some times a node doesn't have a op type, + // so we use node name as the op type. + def->set_op(name); return true; } } // namespace @@ -86,7 +88,7 @@ TFStats::TFStats(const string& filename, } for (const auto& node_pb : profile.nodes()) { std::unique_ptr<TFGraphNode> node( - new TFGraphNode(node_pb.second, profile, &id_to_string_)); + new TFGraphNode(node_pb.second, profile, &id_to_string_, &nodes_map_)); nodes_map_.insert(std::pair<string, std::unique_ptr<TFGraphNode>>( node_pb.second.name(), std::move(node))); } @@ -178,12 +180,14 @@ const MultiGraphNodeProto& TFStats::ShowMultiGraphNode( void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) { std::map<string, const NodeDef*> node_defs; + bool node_added = false; for (const NodeDef& node : graph->node()) { if (nodes_map_.find(node.name()) != nodes_map_.end()) { continue; } - nodes_map_[node.name()] = - std::unique_ptr<TFGraphNode>(new TFGraphNode(&node, nodes_map_.size())); + node_added = true; + nodes_map_[node.name()] = std::unique_ptr<TFGraphNode>( + new TFGraphNode(&node, nodes_map_.size(), &nodes_map_)); node_defs[node.name()] = &node; } for (auto it = node_defs.begin(); it != node_defs.end(); it++) { @@ -192,6 +196,7 @@ void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) { string node_input = it->second->input(i); int output_idx = 0; // input name format can be: "^node:src_output" + // if not :src_output, then it's the first one (further verify?) 
auto prefix_pos = node_input.find(":"); if (prefix_pos != node_input.npos) { std::vector<string> input_parts = str_util::Split(node_input, ":"); @@ -204,15 +209,18 @@ void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) { if (node_input.substr(0, 1) == "^") { node_input = node_input.substr(1); } - auto input_node = nodes_map_.find(node_input); - // TODO(xpan): P1: Add the input even if it doesn't exist yet, because - // this can be a partial graph. - if (input_node == nodes_map_.end()) { - continue; - } - node->AddInput(input_node->second.get(), output_idx, i); + // Delay input TFGraphNode retrieval as late as possible. + // In long run, when we have TensorFlow runtime graph, the + // graph connection should be dynamic and per-step. + node->AddInput(node_input, output_idx, i); } } + if (node_added) { + graph_view_.reset(nullptr); + scope_view_.reset(nullptr); + op_view_.reset(nullptr); + code_view_.reset(nullptr); + } } void TFStats::AddOpLogProto(std::unique_ptr<OpLogProto> op_log) { @@ -263,10 +271,11 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) { NodeDef def; if (CreateRunMetadataNode(name, &def)) { nodes_map_[name] = std::unique_ptr<TFGraphNode>( - new TFGraphNode(&def, nodes_map_.size())); + new TFGraphNode(&def, nodes_map_.size(), &nodes_map_)); nodes_map_.at(name)->AddStepStat(step, dev_stat.device(), node_stat); } } else { + covered_nodes_.insert(node->second->id()); node->second->AddStepStat(step, dev_stat.device(), node_stat); } } diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h index bb4baea738..d46d923556 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats.h +++ b/tensorflow/core/profiler/internal/tfprof_stats.h @@ -66,6 +66,9 @@ class TFStats { } const std::set<int64>& steps() const { return steps_; } bool has_code_traces() const { return has_code_traces_; } + double run_coverage() const { + return covered_nodes_.size() / (nodes_map_.size() + 1e-10); + } void BuildView(const string& cmd); void BuildAllViews(); @@ -104,13 +107,16 @@ class TFStats { std::unique_ptr<TFCode> code_view_; std::unique_ptr<TFOp> op_view_; std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader_; - // Store TFGraphNode instead of TFGraphNode* to avoid large number of - // dynamic alloc. + // TODO(xpan): Store TFGraphNode instead of TFGraphNode* to avoid large + // number of dynamic alloc. + // Maps from graph node name to TFGraphNode. std::map<string, std::unique_ptr<TFGraphNode>> nodes_map_; GraphNodeProto empty_graph_node_; MultiGraphNodeProto empty_multi_graph_node_; std::map<int64, string> id_to_string_; + // Graph nodes covered by RunMetdata, that is traced with run time stats. + std::set<int64> covered_nodes_; }; } // namespace tfprof diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc index 1732574cc4..bdb000747d 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline.cc +++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc @@ -25,6 +25,8 @@ limitations under the License. 
namespace tensorflow { namespace tfprof { namespace { +int kMaxDisplayedMemNode = 10; + string GetTimeDevName(const string& dev) { if (dev.find("stream") != dev.npos) { return strings::StrCat("Op execution threads: ", dev); @@ -85,14 +87,41 @@ void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid, events_.push_back(event); } -void ChromeTraceFormatter::EmitCounter(const string& category, - const string& name, int64 pid, int64 ts, - const string& device, int64 bytes) { - Json::Value event = CreateEvent("C", category, name, pid, 0, ts); +void ChromeTraceFormatter::EmitCounter( + const string& category, const string& name, int64 pid, int64 ts, + const string& device, int64 bytes, + const std::map<int64, std::vector<string>>& tensor_mem) { + Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts); Json::Value args(Json::objectValue); - args[device] = Json::Value(bytes); + args["Allocator Bytes in Use"] = Json::Value(bytes); event["args"] = args; events_.push_back(event); + + // TODO(xpan): chrome://tracing is not ideal visualization for memory. + // It would be great to have a customized UI for it. + Json::Value event2 = + CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts); + Json::Value args2(Json::objectValue); + // Need to reserve the same args for all locations. + for (int i = 1; i < kMaxDisplayedMemNode; ++i) { + args2[strings::Printf("Top Allocation %02d", i)] = Json::Value("N/A"); + } + int count = 0; + for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) { + for (const string& t : it->second) { + if (bytes < it->first || count >= kMaxDisplayedMemNode) { + break; + } + args2[strings::Printf("Top Allocation %02d", count)] = + Json::Value(strings::StrCat(it->first / 1000000.0, " MB from ", t)); + ++count; + bytes -= it->first; + } + } + args2[strings::StrCat("Not Displayed")] = + Json::Value(strings::Printf("%.2f MB", bytes / 1000000.0)); + event2["args"] = args2; + events_.push_back(event2); } string ChromeTraceFormatter::Format() { @@ -119,71 +148,28 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) { if (!node->Trackable(step)) { return; } + Device& dev = devices_[node->node->canonical_device()]; - int64 end_micros = node->node->latest_end_micros(step); - if (node->node->accelerator_persistent_bytes(step) != 0) { - string tensor_name = strings::StrCat(node->name(), ":", -1); - dev.earliest_ref[tensor_name] = node->node->all_start_micros(step); - dev.tensor_size[tensor_name] = - node->node->accelerator_persistent_bytes(step); - // TODO(xpan): Need latest_ref? 
- } - if (node->node->accelerator_temp_bytes(step)) { - string tensor_name = strings::StrCat(node->name(), ":", -2); - dev.earliest_ref[tensor_name] = node->node->all_start_micros(step); - dev.latest_ref[tensor_name] = end_micros; - dev.tensor_size[tensor_name] = node->node->accelerator_temp_bytes(step); - } - if (node->node->allocator_bytes_in_use(step) > 0) { - dev.allocator_stats[end_micros] = node->node->allocator_bytes_in_use(step); - } -} -void MemoryTracker::TrackNodeConnection(int64 step, const GraphNode* node, - const GraphNode* src) { - if (!node->Trackable(step) || !src->Trackable(step)) { - return; - } - const auto& output_idx = node->node->src_output_idx().find(src->name()); - if (output_idx == node->node->src_output_idx().end()) { - return; - } - const auto& output = src->node->output_memory(step).find(output_idx->second); - if (output == src->node->output_memory(step).end()) { - return; + std::map<int64, int64> allocs; + for (const auto& alloc : node->node->allocations(step)) { + for (const auto& r : alloc.allocation_records()) { + allocs[r.alloc_micros()] += r.alloc_bytes(); + dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes(); + } } - int64 output_bytes = output->second.first; - uint64 output_ptr = output->second.second; - - Device& src_dev = devices_[src->node->canonical_device()]; - string tensor_name = strings::StrCat(output_ptr); - if (output_ptr == 0) { - fprintf(stderr, "output no ptr\n"); - tensor_name = strings::StrCat(src->node->name(), ":", output_idx->second); + dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes(); + allocs[0] += node->node->accelerator_persistent_bytes(); + + int64 last = 0; + std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()]; + for (auto it = allocs.begin(); it != allocs.end(); ++it) { + last += it->second; + aggregate_allocs[it->first] = last; } - - src_dev.tensor_size[tensor_name] = output_bytes; - src_dev.earliest_ref[tensor_name] = src->node->all_start_micros(step); - - int64 src_end_micros = src->node->latest_end_micros(step); - - if (src->node->canonical_device() != node->node->canonical_device()) { - int64 transfer_micros = - (src_end_micros + node->node->all_start_micros(step)) / 2; - src_dev.latest_ref[tensor_name] = - std::max(src_dev.latest_ref[tensor_name], transfer_micros); - - Device& dest_dev = devices_[node->node->canonical_device()]; - string dest_tensor_name = - strings::StrCat(tensor_name, node->node->canonical_device()); - dest_dev.tensor_size[dest_tensor_name] = output_bytes; - dest_dev.earliest_ref[dest_tensor_name] = transfer_micros; - dest_dev.latest_ref[dest_tensor_name] = - std::max(dest_dev.latest_ref[dest_tensor_name], - node->node->latest_end_micros(step)); - } else { - src_dev.latest_ref[tensor_name] = std::max( - src_dev.latest_ref[tensor_name], node->node->latest_end_micros(step)); + int64 end_micros = node->node->lastest_schedule_end_micros(step); + if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) { + dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step); } } @@ -222,22 +208,24 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { for (GraphNode* gnode : gnodes) { AllocateTimeNodes(gnode); } + // To save memory, we only track cross-device (canonical device) flows. 
for (auto& process : tnodes_) { + if (!IsCanonicalDevice(process.first)) continue; for (auto& tn : process.second) { TimeNode* tnode = tn.second.get(); for (GraphNode* inp : tnode->node->children) { if (!inp->account || !inp->Trackable(step_)) { continue; } - TrackNodeConnection(tnode->node, inp); - for (const auto& kernel_execs : inp->node->op_execs(step_)) { - if (process.first == kernel_execs.first) { - // Not interested in flow withthin the same device. + for (const auto& execs : inp->node->cpu_execs(step_)) { + if (!IsCanonicalDevice(execs.first)) continue; + if (process.first == execs.first) { + // Not interested in flow within the same device. continue; } - for (const auto& exec : kernel_execs.second) { + for (const auto& exec : execs.second) { int64 start_micros = exec.first; - auto cprocess = tnodes_.find(kernel_execs.first); + auto cprocess = tnodes_.find(execs.first); if (cprocess == tnodes_.end()) continue; auto ctn = cprocess->second.find(start_micros); if (ctn == cprocess->second.end()) continue; @@ -258,7 +246,6 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { Json::Value args(Json::objectValue); args["name"] = Json::Value(tnode->name()); - args["op"] = Json::Value(tnode->name()); chrome_formatter_.EmitRegion(node.first, tnode->exec_micros, process.first, lane.first, "Op", tnode->name(), args); @@ -280,12 +267,40 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { for (const auto& dev : mem_tracker_.devices()) { int64 pid = AllocatePID(); chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid); + int64 pid2 = AllocatePID(); + chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations", + pid2); + const MemoryTracker::Device& device = dev.second; - for (const auto& alloc_stats : device.allocator_stats) { - chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, - alloc_stats.first, dev.first, - alloc_stats.second); + int64 max_bytes_in_use = 0; + int64 cur_bytes_in_use = 0; + int64 last_point = 0; + for (const auto& alloc : device.allocations) { + cur_bytes_in_use = alloc.second; + max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use); + // Do not plot too dense to reduce file size. + int64 ts = alloc.first; + if (ts - last_point < 100) continue; + last_point = ts; + + std::map<int64, std::vector<string>> tensor_mem; + for (const auto& tensor_alloc_it : dev.second.tensor_allocs) { + const auto& tensor_alloc = tensor_alloc_it.second; + auto it = tensor_alloc.lower_bound(ts); + if (it != tensor_alloc.begin()) { + --it; + } + if (it->second > 0) { + tensor_mem[it->second].push_back(tensor_alloc_it.first); + } + } + chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts, + dev.first, cur_bytes_in_use, tensor_mem); + } + if (IsPlacedOnAccelerator(dev.first)) { + fprintf(stdout, "%s peak memory: %.2f MB\n", dev.first.c_str(), + max_bytes_in_use / 1000000.0); } } OutputTimeline(); diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h index 6c62d1046f..b8174cdecb 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline.h +++ b/tensorflow/core/profiler/internal/tfprof_timeline.h @@ -28,10 +28,12 @@ namespace tfprof { typedef std::map<string, string> Event; +// Class for generating timeline json output. class ChromeTraceFormatter { public: ChromeTraceFormatter() {} - + // The following methods creates timeline nodes. See chrome tracing format + // document for details. 
Json::Value CreateEvent(const string& ph, const string& category, const string& name, int64 pid, int64 tid, int64 ts); @@ -47,22 +49,27 @@ class ChromeTraceFormatter { int64 flow_id); void EmitCounter(const string& category, const string& name, int64 pid, - int64 ts, const string& device, int64 bytes); + int64 ts, const string& device, int64 bytes, + const std::map<int64, std::vector<string>>& tensor_mem); string Format(); private: + // A event is a visualization unit in timeline. std::vector<Json::Value> events_; std::vector<Json::Value> metadata_; }; +// A process (time series of events) in the timeline. class Process { public: Process(const string& device, int64 pid) : device(device), pid(pid) {} // Each lane is a map from start_time to end_time. std::vector<std::map<int64, int64>> lanes; + // device for the time series. string device; + // unique id for the time series. int64 pid; }; @@ -96,19 +103,16 @@ class MemoryTracker { public: class Device { public: - // The first 3 fields are predicted. - std::map<string, int64> tensor_size; - std::map<string, int64> earliest_ref; - std::map<string, int64> latest_ref; + // map from tensor name to a pair of <alloc time, bytes_in_use>. + std::map<string, std::map<int64, int64>> tensor_allocs; // ground truth memory stats. time->bytes. - std::map<int64, int64> allocator_stats; + std::map<int64, int64> allocations; + // tracked allocations, might miss some bytes. + std::map<int64, int64> tracked_allocations; }; void TrackNode(int64 step, const GraphNode* node); - void TrackNodeConnection(int64 step, const GraphNode* node, - const GraphNode* src); - const std::map<string, Device>& devices() const { return devices_; } private: @@ -130,13 +134,9 @@ class Timeline { void GenerateCodeTimeline(const CodeNode* node); + private: void TrackNode(const GraphNode* node) { mem_tracker_.TrackNode(step_, node); } - void TrackNodeConnection(GraphNode* node, GraphNode* src) { - mem_tracker_.TrackNodeConnection(step_, node, src); - } - - private: void OutputTimeline(); template <typename Node> diff --git a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc index babae395ba..91eac0cf76 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc +++ b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc @@ -71,7 +71,7 @@ TEST_F(TFProfTimelineTest, GraphView) { string dump_str; TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file + "_0", &dump_str)); - EXPECT_EQ(1754536562981488144ull, Hash64(dump_str)); + EXPECT_EQ(7932146665024565912ull, Hash64(dump_str)); } TEST_F(TFProfTimelineTest, ScopeView) { diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc index 96e0b06bf3..a5e513aa21 100644 --- a/tensorflow/core/profiler/profiler.cc +++ b/tensorflow/core/profiler/profiler.cc @@ -234,6 +234,7 @@ int Run(int argc, char** argv) { return 1; } tf_stat->AddRunMeta(i, std::move(run_meta)); + fprintf(stdout, "run graph coverage: %.2f\n", tf_stat->run_coverage()); } } diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto index a1410c7c79..f92301133a 100644 --- a/tensorflow/core/profiler/tfprof_log.proto +++ b/tensorflow/core/profiler/tfprof_log.proto @@ -3,6 +3,7 @@ syntax = "proto3"; package tensorflow.tfprof; import "tensorflow/core/framework/attr_value.proto"; +import "tensorflow/core/framework/step_stats.proto"; // It specifies the Python callstack that creates an op. 
message CodeDef { @@ -89,6 +90,10 @@ message ProfileNode { map<int64, ExecProfile> execs = 12; } +message Allocation { + repeated AllocationRecord allocation_records = 1; +} + message ExecProfile { // Can be larger than 1 if run multiple times in loop. int64 run_count = 1; @@ -107,6 +112,8 @@ message ExecProfile { map<int32, Memory> output_memory = 17; + repeated Allocation allocations = 18; + repeated string devices = 6; // Total bytes requested by the op. |
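On the visualization side, the reworked ChromeTraceFormatter::EmitCounter emits a second counter series that names the largest live tensors at each sampled timestamp. The sketch below uses made-up tensor names and plain printf output instead of the JSON counter events the real code builds, but it follows the same top-K selection loop, including the kMaxDisplayedMemNode cap and the "Not Displayed" remainder.

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  const int kMaxDisplayedMemNode = 10;
  int64_t bytes = 26 * 1000000;  // allocator bytes in use at this timestamp
  // Bytes -> names of live tensors of that size (mirrors the tensor_mem
  // argument assembled in Timeline::GenerateGraphTimeline).
  std::map<int64_t, std::vector<std::string>> tensor_mem = {
      {16 * 1000000, {"conv1/weights"}},
      {8 * 1000000, {"conv1/activations"}},
      {1 * 1000000, {"bias", "scratch"}}};

  // Walk from the largest size down, charging each displayed tensor against
  // the bytes-in-use figure so the leftover can be reported at the end.
  int count = 0;
  for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
    for (const std::string& t : it->second) {
      if (bytes < it->first || count >= kMaxDisplayedMemNode) break;
      std::printf("Top Allocation %02d: %.2f MB from %s\n", count,
                  it->first / 1000000.0, t.c_str());
      ++count;
      bytes -= it->first;
    }
  }
  std::printf("Not Displayed: %.2f MB\n", bytes / 1000000.0);
  return 0;
}
```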