aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/profiler
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2017-10-30 20:08:55 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-10-30 20:13:00 -0700
commit113be57466d36ab7086794475cf4579f3e6b940b (patch)
tree2c77d2a78afe04d79386f5afbfa20a28513f4a68 /tensorflow/core/profiler
parent6d1263cdf8ee8323513f984553dbeb070865fd0c (diff)
A few profiler improvements
1. Track the full allocation history of each tensor, visualized in timeline. 2. Better ProfileContext for tracing step selection. 3. Small bug fix. PiperOrigin-RevId: 173988293
Diffstat (limited to 'tensorflow/core/profiler')
-rw-r--r--tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc2
-rw-r--r--tensorflow/core/profiler/internal/print_model_analysis.cc6
-rw-r--r--tensorflow/core/profiler/internal/print_model_analysis.h4
-rw-r--r--tensorflow/core/profiler/internal/tfprof_graph.cc2
-rw-r--r--tensorflow/core/profiler/internal/tfprof_node.cc16
-rw-r--r--tensorflow/core/profiler/internal/tfprof_node.h135
-rw-r--r--tensorflow/core/profiler/internal/tfprof_scope.cc2
-rw-r--r--tensorflow/core/profiler/internal/tfprof_stats.cc33
-rw-r--r--tensorflow/core/profiler/internal/tfprof_stats.h10
-rw-r--r--tensorflow/core/profiler/internal/tfprof_timeline.cc169
-rw-r--r--tensorflow/core/profiler/internal/tfprof_timeline.h30
-rw-r--r--tensorflow/core/profiler/internal/tfprof_timeline_test.cc2
-rw-r--r--tensorflow/core/profiler/profiler.cc1
-rw-r--r--tensorflow/core/profiler/tfprof_log.proto7
14 files changed, 273 insertions, 146 deletions
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index c39d44b7fa..d05143aff9 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -48,7 +48,7 @@ class TFProfAdvisorTest : public ::testing::Test {
for (const auto& attr : attrs) {
(*def->mutable_attr())[attr.first].set_s(attr.second);
}
- std::unique_ptr<TFGraphNode> node(new TFGraphNode(def, -1));
+ std::unique_ptr<TFGraphNode> node(new TFGraphNode(def, -1, nullptr));
NodeExecStats node_stat;
node_stat.set_all_start_micros(start_miros);
diff --git a/tensorflow/core/profiler/internal/print_model_analysis.cc b/tensorflow/core/profiler/internal/print_model_analysis.cc
index 575ae182ee..7a0d590262 100644
--- a/tensorflow/core/profiler/internal/print_model_analysis.cc
+++ b/tensorflow/core/profiler/internal/print_model_analysis.cc
@@ -119,8 +119,8 @@ void DeleteProfiler() {
}
}
-void AddStep(int64 step, const string* graph, const string* run_meta,
- const string* op_log) {
+double AddStep(int64 step, const string* graph, const string* run_meta,
+ const string* op_log) {
CHECK(tf_stat);
CHECK(graph && !graph->empty());
@@ -144,6 +144,7 @@ void AddStep(int64 step, const string* graph, const string* run_meta,
op_log_ptr->ParseFromString(*op_log);
tf_stat->AddOpLogProto(std::move(op_log_ptr));
}
+ return tf_stat->run_coverage();
}
string Profile(const string* command, const string* options) {
@@ -154,6 +155,7 @@ string Profile(const string* command, const string* options) {
}
void WriteProfile(const string* filename) {
+ CHECK(tf_stat);
CHECK(filename) << "empty file name when asking to write profile.";
tf_stat->WriteProfile(*filename);
}
diff --git a/tensorflow/core/profiler/internal/print_model_analysis.h b/tensorflow/core/profiler/internal/print_model_analysis.h
index e4d01041a8..31ff5b07b0 100644
--- a/tensorflow/core/profiler/internal/print_model_analysis.h
+++ b/tensorflow/core/profiler/internal/print_model_analysis.h
@@ -35,8 +35,8 @@ bool NewProfiler(const string* graph, const string* op_log);
void DeleteProfiler();
-void AddStep(int64 step, const string* graph, const string* run_meta,
- const string* op_log);
+double AddStep(int64 step, const string* graph, const string* run_meta,
+ const string* op_log);
// Write the profiler's profile to a proto buffer.
void WriteProfile(const string* filename);
diff --git a/tensorflow/core/profiler/internal/tfprof_graph.cc b/tensorflow/core/profiler/internal/tfprof_graph.cc
index 3766365bf8..db7ae3b397 100644
--- a/tensorflow/core/profiler/internal/tfprof_graph.cc
+++ b/tensorflow/core/profiler/internal/tfprof_graph.cc
@@ -31,7 +31,7 @@ GraphNode* TFGraph::CreateParentNode(const string& name) {
node_defs_.back()->set_name(name);
node_defs_.back()->set_op(kTFGraphParent);
parent_nodes_[name] = std::unique_ptr<TFGraphNode>(
- new TFGraphNode(node_defs_.back().get(), -1));
+ new TFGraphNode(node_defs_.back().get(), -1, nullptr));
nodes_map_[name] =
std::unique_ptr<GraphNode>(new GraphNode(parent_nodes_[name].get()));
return nodes_map_[name].get();
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index f283fafc0f..671b65d708 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -19,19 +19,15 @@ limitations under the License.
namespace tensorflow {
namespace tfprof {
-namespace {
bool CountAsAcceleratorTime(const string& device) {
return device.find("stream:all") != device.npos;
}
-
bool CountAsCPUTime(const string& device) {
return RE2::FullMatch(device,
".*/(device:gpu|gpu|device:cpu|cpu|device:sycl):\\d+");
}
-
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
-} // namespace
// Notes about start and end time from the NodeExecStats proto:
// For GPU, there is no difference between op_end_rel_micros and
// all_end_rel_micros. All are kernel times.
@@ -89,16 +85,28 @@ void ExecStep::AddMemoryStats(const string& dev,
}
exec_.set_memory_intialized(true);
+ int accelerator_allocator_cnt = 0;
for (const auto& mem : step_stat.memory()) {
// TODO(xpan): Fix this hack. Currently the allocator name seems quite
// ad-hoc.
if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
continue;
}
+ ++accelerator_allocator_cnt;
exec_.set_allocator_bytes_in_use(
std::max(static_cast<int64>(exec_.allocator_bytes_in_use()),
static_cast<int64>(mem.allocator_bytes_in_use())));
+ Allocation allocation;
+ for (const auto& alloc : mem.allocation_records()) {
+ allocation.add_allocation_records()->MergeFrom(alloc);
+ }
+ allocations_.push_back(allocation);
}
+ if (accelerator_allocator_cnt > 1) {
+ fprintf(stderr, "found %d gpu allocator for 1 node\n",
+ accelerator_allocator_cnt);
+ }
+
int64 total_output_bytes = 0;
for (const auto& output : step_stat.output()) {
if (output.has_tensor_description() &&
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
index 34bc0a581d..e2d0563a07 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -105,8 +105,22 @@ class ExecStep {
const {
return op_execs_;
}
+ const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs()
+ const {
+ return cpu_execs_;
+ }
+
int64 all_start_micros() const { return exec_.all_start_micros(); }
int64 latest_end_micros() const { return exec_.latest_end_micros(); }
+ int64 lastest_schedule_end_micros() const {
+ int64 ret = 0;
+ for (const auto& exec : cpu_execs_) {
+ for (const auto& pair : exec.second) {
+ ret = std::max(ret, pair.first + pair.second);
+ }
+ }
+ return ret;
+ }
int64 requested_bytes() const { return exec_.requested_bytes(); }
int64 peak_bytes() const { return exec_.peak_bytes(); }
@@ -127,6 +141,8 @@ class ExecStep {
return exec_.allocator_bytes_in_use();
}
+ const std::vector<Allocation>& allocations() const { return allocations_; }
+
const ExecProfile& ToProto() {
exec_.mutable_accelerator_execs()->clear();
for (const auto& e : accelerator_execs_) {
@@ -161,6 +177,11 @@ class ExecStep {
mem_pb.set_ptr(mem.second.second);
}
+ exec_.mutable_allocations()->Clear();
+ for (const auto& r : allocations_) {
+ exec_.add_allocations()->MergeFrom(r);
+ }
+
return exec_;
}
@@ -175,6 +196,8 @@ class ExecStep {
cpu_execs_.clear();
op_execs_.clear();
+ allocations_.clear();
+
for (const auto& exec_time : exec_.accelerator_execs()) {
auto& exec = accelerator_execs_[exec_time.first];
auto& op_exec = op_execs_[exec_time.first];
@@ -196,6 +219,10 @@ class ExecStep {
mem.first = output_mem.second.bytes();
mem.second = output_mem.second.ptr();
}
+
+ for (const auto& r : exec_.allocations()) {
+ allocations_.push_back(r);
+ }
}
private:
@@ -215,6 +242,9 @@ class ExecStep {
std::set<string> devices_;
// output_idx -> {output_bytes, memory_ptr}
std::map<int32, std::pair<int64, uint64>> output_memory_;
+
+ // The history of accelerator allocations and deallocations of this step.
+ std::vector<Allocation> allocations_;
};
#define GRAPH_NODE_BYTES(type) \
@@ -238,11 +268,15 @@ class ExecStep {
class TFGraphNode {
public:
TFGraphNode(const ProfileNode& node, const ProfileProto& profile,
- const std::map<int64, string>* id_to_string) {
+ const std::map<int64, string>* id_to_string,
+ const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
+ nodes_map_ = nodes_map;
FromProto(node, profile, id_to_string);
}
- TFGraphNode(const NodeDef* node, int64 id) {
+ TFGraphNode(const NodeDef* node, int64 id,
+ const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) {
+ nodes_map_ = nodes_map;
node_.set_id(id);
node_.set_name(node->name());
node_.set_op(node->op());
@@ -269,17 +303,9 @@ class TFGraphNode {
op_types_.insert(node->op());
}
- void AddInput(TFGraphNode* input, int32 output_idx, int input_idx) {
- src_output_idx_[input->name()] = output_idx;
-
- inputs_[input_idx] = input->name();
- const auto& output_shape = input->output_shapes().find(output_idx);
- // Always create an empty vec even if the shape info might be missing.
- std::vector<int64>& shape_vec = input_shapes_[input_idx];
- if (output_shape != input->output_shapes().end()) {
- shape_vec.assign(output_shape->second.begin(),
- output_shape->second.end());
- }
+ void AddInput(const string& input, int64 output_index, int input_idx) {
+ inputs_[input_idx] = input;
+ src_output_idx_[input] = output_index;
}
void AddOpType(const string& op_type) { op_types_.insert(op_type); }
@@ -416,9 +442,6 @@ class TFGraphNode {
}
const std::map<int32, string>& inputs() const { return inputs_; }
- const std::map<string, int32>& src_output_idx() const {
- return src_output_idx_;
- }
// Number of times the graph node is executed. When step < 0, the
// average number of times executed across all steps.
@@ -526,14 +549,30 @@ class TFGraphNode {
return exec->second.latest_end_micros();
}
+ int64 lastest_schedule_end_micros(int64 step) const {
+ auto exec = execs_.find(step);
+ if (exec == execs_.end()) {
+ return 0;
+ }
+ return exec->second.lastest_schedule_end_micros();
+ }
+
const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs(
int64 step) const {
auto exec = execs_.find(step);
if (exec == execs_.end()) {
- return empty_op_execs_;
+ return empty_execs_;
}
return exec->second.op_execs();
}
+ const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs(
+ int64 step) const {
+ auto exec = execs_.find(step);
+ if (exec == execs_.end()) {
+ return empty_execs_;
+ }
+ return exec->second.cpu_execs();
+ }
const std::map<int64, ExecStep>& all_op_execs() const { return execs_; }
@@ -551,12 +590,12 @@ class TFGraphNode {
}
return exec->second.host_temp_bytes();
}
- int64 accelerator_persistent_bytes(int64 step) const {
- auto exec = execs_.find(step);
- if (exec == execs_.end()) {
- return 0;
+ int64 accelerator_persistent_bytes() const {
+ int64 persistent_bytes = 0;
+ for (const auto& exec : execs_) {
+ persistent_bytes += exec.second.accelerator_persistent_bytes();
}
- return exec->second.accelerator_persistent_bytes();
+ return persistent_bytes;
}
int64 host_persistent_bytes(int64 step) const {
auto exec = execs_.find(step);
@@ -581,6 +620,14 @@ class TFGraphNode {
return exec->second.allocator_bytes_in_use();
}
+ const std::vector<Allocation>& allocations(int64 step) const {
+ auto exec = execs_.find(step);
+ if (exec == execs_.end()) {
+ return empty_allocations_;
+ }
+ return exec->second.allocations();
+ }
+
int64 parameters() const {
if (!shape().empty()) {
int64 params = 1;
@@ -628,18 +675,44 @@ class TFGraphNode {
const std::map<int, std::vector<int64>>& output_shapes() const {
return output_shapes_;
}
- const std::map<int, std::vector<int64>>& input_shapes() const {
- return input_shapes_;
+
+ const std::map<int, std::vector<int64>> input_shapes() const {
+ std::map<int, std::vector<int64>> input_shapes;
+ for (const auto& inp : inputs_) {
+ // Always create an empty vec even if the shape info might be missing.
+ std::vector<int64>& shape_vec = input_shapes[inp.first];
+ if (!nodes_map_) continue;
+ auto input_it = nodes_map_->find(inp.second);
+ if (input_it == nodes_map_->end()) continue;
+ auto output_it = src_output_idx_.find(inp.second);
+ if (output_it == src_output_idx_.end()) continue;
+
+ const TFGraphNode* input_node = input_it->second.get();
+ if (!input_node) continue;
+ const auto& output_shapes = input_node->output_shapes();
+ const auto& output_shape = output_shapes.find(output_it->second);
+ if (output_shape == output_shapes.end()) continue;
+
+ if (output_shape != input_node->output_shapes().end()) {
+ shape_vec.assign(output_shape->second.begin(),
+ output_shape->second.end());
+ }
+ }
+ return input_shapes;
}
private:
+ // maps graph node name to TFGraphNode. Not owned.
+ const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map_;
+ // inputs to the node. input index -> input node name.
std::map<int, string> inputs_;
+ // The output index of the source node.
std::map<string, int32> src_output_idx_;
-
+ // proto for serialize/deserialized representation of the node.
ProfileNode node_;
-
+ // Python call stack that creates the name.
std::unique_ptr<CallStack> call_stack_;
-
+ // Shape of the node (e.g. Variable) if available.
std::vector<int64> shape_;
// Won't missing input_idx. But some shapes might be empty (unknown).
std::map<int, std::vector<int64>> input_shapes_;
@@ -651,8 +724,10 @@ class TFGraphNode {
std::map<int64, ExecStep> execs_;
+ // Placeholder for empty cases.
std::map<int32, std::pair<int64, uint64>> empty_output_memory_;
- std::map<string, std::vector<std::pair<int64, int64>>> empty_op_execs_;
+ std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_;
+ std::vector<Allocation> empty_allocations_;
};
class TFMultiGraphNode {
@@ -806,6 +881,10 @@ class TFMultiGraphNode {
};
bool IsPlacedOnAccelerator(const string& device);
+bool CountAsAcceleratorTime(const string& device);
+bool CountAsCPUTime(const string& device);
+bool IsCanonicalDevice(const string& device);
+
} // namespace tfprof
} // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_scope.cc b/tensorflow/core/profiler/internal/tfprof_scope.cc
index 128b296d5c..988bed71cc 100644
--- a/tensorflow/core/profiler/internal/tfprof_scope.cc
+++ b/tensorflow/core/profiler/internal/tfprof_scope.cc
@@ -35,7 +35,7 @@ ScopeNode* TFScope::CreateParentNode(const string& name) {
node_defs_.back()->set_name(name);
node_defs_.back()->set_op(kTFScopeParent);
parent_nodes_[name] = std::unique_ptr<TFGraphNode>(
- new TFGraphNode(node_defs_.back().get(), -1));
+ new TFGraphNode(node_defs_.back().get(), -1, nullptr));
nodes_map_[name] =
std::unique_ptr<ScopeNode>(new ScopeNode(parent_nodes_[name].get()));
return nodes_map_[name].get();
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc
index b4b98141f3..7943c075e0 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats.cc
@@ -36,7 +36,9 @@ bool CreateRunMetadataNode(const string& name, NodeDef* def) {
}
def->set_name(name);
// TODO(xpan): Better operation type.
- def->set_op("RunTimeOp");
+ // This is because some times a node doesn't have a op type,
+ // so we use node name as the op type.
+ def->set_op(name);
return true;
}
} // namespace
@@ -86,7 +88,7 @@ TFStats::TFStats(const string& filename,
}
for (const auto& node_pb : profile.nodes()) {
std::unique_ptr<TFGraphNode> node(
- new TFGraphNode(node_pb.second, profile, &id_to_string_));
+ new TFGraphNode(node_pb.second, profile, &id_to_string_, &nodes_map_));
nodes_map_.insert(std::pair<string, std::unique_ptr<TFGraphNode>>(
node_pb.second.name(), std::move(node)));
}
@@ -178,12 +180,14 @@ const MultiGraphNodeProto& TFStats::ShowMultiGraphNode(
void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) {
std::map<string, const NodeDef*> node_defs;
+ bool node_added = false;
for (const NodeDef& node : graph->node()) {
if (nodes_map_.find(node.name()) != nodes_map_.end()) {
continue;
}
- nodes_map_[node.name()] =
- std::unique_ptr<TFGraphNode>(new TFGraphNode(&node, nodes_map_.size()));
+ node_added = true;
+ nodes_map_[node.name()] = std::unique_ptr<TFGraphNode>(
+ new TFGraphNode(&node, nodes_map_.size(), &nodes_map_));
node_defs[node.name()] = &node;
}
for (auto it = node_defs.begin(); it != node_defs.end(); it++) {
@@ -192,6 +196,7 @@ void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) {
string node_input = it->second->input(i);
int output_idx = 0;
// input name format can be: "^node:src_output"
+ // if not :src_output, then it's the first one (further verify?)
auto prefix_pos = node_input.find(":");
if (prefix_pos != node_input.npos) {
std::vector<string> input_parts = str_util::Split(node_input, ":");
@@ -204,15 +209,18 @@ void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) {
if (node_input.substr(0, 1) == "^") {
node_input = node_input.substr(1);
}
- auto input_node = nodes_map_.find(node_input);
- // TODO(xpan): P1: Add the input even if it doesn't exist yet, because
- // this can be a partial graph.
- if (input_node == nodes_map_.end()) {
- continue;
- }
- node->AddInput(input_node->second.get(), output_idx, i);
+ // Delay input TFGraphNode retrieval as late as possible.
+ // In long run, when we have TensorFlow runtime graph, the
+ // graph connection should be dynamic and per-step.
+ node->AddInput(node_input, output_idx, i);
}
}
+ if (node_added) {
+ graph_view_.reset(nullptr);
+ scope_view_.reset(nullptr);
+ op_view_.reset(nullptr);
+ code_view_.reset(nullptr);
+ }
}
void TFStats::AddOpLogProto(std::unique_ptr<OpLogProto> op_log) {
@@ -263,10 +271,11 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
NodeDef def;
if (CreateRunMetadataNode(name, &def)) {
nodes_map_[name] = std::unique_ptr<TFGraphNode>(
- new TFGraphNode(&def, nodes_map_.size()));
+ new TFGraphNode(&def, nodes_map_.size(), &nodes_map_));
nodes_map_.at(name)->AddStepStat(step, dev_stat.device(), node_stat);
}
} else {
+ covered_nodes_.insert(node->second->id());
node->second->AddStepStat(step, dev_stat.device(), node_stat);
}
}
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h
index bb4baea738..d46d923556 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.h
+++ b/tensorflow/core/profiler/internal/tfprof_stats.h
@@ -66,6 +66,9 @@ class TFStats {
}
const std::set<int64>& steps() const { return steps_; }
bool has_code_traces() const { return has_code_traces_; }
+ double run_coverage() const {
+ return covered_nodes_.size() / (nodes_map_.size() + 1e-10);
+ }
void BuildView(const string& cmd);
void BuildAllViews();
@@ -104,13 +107,16 @@ class TFStats {
std::unique_ptr<TFCode> code_view_;
std::unique_ptr<TFOp> op_view_;
std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader_;
- // Store TFGraphNode instead of TFGraphNode* to avoid large number of
- // dynamic alloc.
+ // TODO(xpan): Store TFGraphNode instead of TFGraphNode* to avoid large
+ // number of dynamic alloc.
+ // Maps from graph node name to TFGraphNode.
std::map<string, std::unique_ptr<TFGraphNode>> nodes_map_;
GraphNodeProto empty_graph_node_;
MultiGraphNodeProto empty_multi_graph_node_;
std::map<int64, string> id_to_string_;
+ // Graph nodes covered by RunMetdata, that is traced with run time stats.
+ std::set<int64> covered_nodes_;
};
} // namespace tfprof
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc
index 1732574cc4..bdb000747d 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc
@@ -25,6 +25,8 @@ limitations under the License.
namespace tensorflow {
namespace tfprof {
namespace {
+int kMaxDisplayedMemNode = 10;
+
string GetTimeDevName(const string& dev) {
if (dev.find("stream") != dev.npos) {
return strings::StrCat("Op execution threads: ", dev);
@@ -85,14 +87,41 @@ void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid,
events_.push_back(event);
}
-void ChromeTraceFormatter::EmitCounter(const string& category,
- const string& name, int64 pid, int64 ts,
- const string& device, int64 bytes) {
- Json::Value event = CreateEvent("C", category, name, pid, 0, ts);
+void ChromeTraceFormatter::EmitCounter(
+ const string& category, const string& name, int64 pid, int64 ts,
+ const string& device, int64 bytes,
+ const std::map<int64, std::vector<string>>& tensor_mem) {
+ Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts);
Json::Value args(Json::objectValue);
- args[device] = Json::Value(bytes);
+ args["Allocator Bytes in Use"] = Json::Value(bytes);
event["args"] = args;
events_.push_back(event);
+
+ // TODO(xpan): chrome://tracing is not ideal visualization for memory.
+ // It would be great to have a customized UI for it.
+ Json::Value event2 =
+ CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts);
+ Json::Value args2(Json::objectValue);
+ // Need to reserve the same args for all locations.
+ for (int i = 1; i < kMaxDisplayedMemNode; ++i) {
+ args2[strings::Printf("Top Allocation %02d", i)] = Json::Value("N/A");
+ }
+ int count = 0;
+ for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
+ for (const string& t : it->second) {
+ if (bytes < it->first || count >= kMaxDisplayedMemNode) {
+ break;
+ }
+ args2[strings::Printf("Top Allocation %02d", count)] =
+ Json::Value(strings::StrCat(it->first / 1000000.0, " MB from ", t));
+ ++count;
+ bytes -= it->first;
+ }
+ }
+ args2[strings::StrCat("Not Displayed")] =
+ Json::Value(strings::Printf("%.2f MB", bytes / 1000000.0));
+ event2["args"] = args2;
+ events_.push_back(event2);
}
string ChromeTraceFormatter::Format() {
@@ -119,71 +148,28 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
if (!node->Trackable(step)) {
return;
}
+
Device& dev = devices_[node->node->canonical_device()];
- int64 end_micros = node->node->latest_end_micros(step);
- if (node->node->accelerator_persistent_bytes(step) != 0) {
- string tensor_name = strings::StrCat(node->name(), ":", -1);
- dev.earliest_ref[tensor_name] = node->node->all_start_micros(step);
- dev.tensor_size[tensor_name] =
- node->node->accelerator_persistent_bytes(step);
- // TODO(xpan): Need latest_ref?
- }
- if (node->node->accelerator_temp_bytes(step)) {
- string tensor_name = strings::StrCat(node->name(), ":", -2);
- dev.earliest_ref[tensor_name] = node->node->all_start_micros(step);
- dev.latest_ref[tensor_name] = end_micros;
- dev.tensor_size[tensor_name] = node->node->accelerator_temp_bytes(step);
- }
- if (node->node->allocator_bytes_in_use(step) > 0) {
- dev.allocator_stats[end_micros] = node->node->allocator_bytes_in_use(step);
- }
-}
-void MemoryTracker::TrackNodeConnection(int64 step, const GraphNode* node,
- const GraphNode* src) {
- if (!node->Trackable(step) || !src->Trackable(step)) {
- return;
- }
- const auto& output_idx = node->node->src_output_idx().find(src->name());
- if (output_idx == node->node->src_output_idx().end()) {
- return;
- }
- const auto& output = src->node->output_memory(step).find(output_idx->second);
- if (output == src->node->output_memory(step).end()) {
- return;
+ std::map<int64, int64> allocs;
+ for (const auto& alloc : node->node->allocations(step)) {
+ for (const auto& r : alloc.allocation_records()) {
+ allocs[r.alloc_micros()] += r.alloc_bytes();
+ dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes();
+ }
}
- int64 output_bytes = output->second.first;
- uint64 output_ptr = output->second.second;
-
- Device& src_dev = devices_[src->node->canonical_device()];
- string tensor_name = strings::StrCat(output_ptr);
- if (output_ptr == 0) {
- fprintf(stderr, "output no ptr\n");
- tensor_name = strings::StrCat(src->node->name(), ":", output_idx->second);
+ dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
+ allocs[0] += node->node->accelerator_persistent_bytes();
+
+ int64 last = 0;
+ std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()];
+ for (auto it = allocs.begin(); it != allocs.end(); ++it) {
+ last += it->second;
+ aggregate_allocs[it->first] = last;
}
-
- src_dev.tensor_size[tensor_name] = output_bytes;
- src_dev.earliest_ref[tensor_name] = src->node->all_start_micros(step);
-
- int64 src_end_micros = src->node->latest_end_micros(step);
-
- if (src->node->canonical_device() != node->node->canonical_device()) {
- int64 transfer_micros =
- (src_end_micros + node->node->all_start_micros(step)) / 2;
- src_dev.latest_ref[tensor_name] =
- std::max(src_dev.latest_ref[tensor_name], transfer_micros);
-
- Device& dest_dev = devices_[node->node->canonical_device()];
- string dest_tensor_name =
- strings::StrCat(tensor_name, node->node->canonical_device());
- dest_dev.tensor_size[dest_tensor_name] = output_bytes;
- dest_dev.earliest_ref[dest_tensor_name] = transfer_micros;
- dest_dev.latest_ref[dest_tensor_name] =
- std::max(dest_dev.latest_ref[dest_tensor_name],
- node->node->latest_end_micros(step));
- } else {
- src_dev.latest_ref[tensor_name] = std::max(
- src_dev.latest_ref[tensor_name], node->node->latest_end_micros(step));
+ int64 end_micros = node->node->lastest_schedule_end_micros(step);
+ if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) {
+ dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step);
}
}
@@ -222,22 +208,24 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
for (GraphNode* gnode : gnodes) {
AllocateTimeNodes(gnode);
}
+ // To save memory, we only track cross-device (canonical device) flows.
for (auto& process : tnodes_) {
+ if (!IsCanonicalDevice(process.first)) continue;
for (auto& tn : process.second) {
TimeNode* tnode = tn.second.get();
for (GraphNode* inp : tnode->node->children) {
if (!inp->account || !inp->Trackable(step_)) {
continue;
}
- TrackNodeConnection(tnode->node, inp);
- for (const auto& kernel_execs : inp->node->op_execs(step_)) {
- if (process.first == kernel_execs.first) {
- // Not interested in flow withthin the same device.
+ for (const auto& execs : inp->node->cpu_execs(step_)) {
+ if (!IsCanonicalDevice(execs.first)) continue;
+ if (process.first == execs.first) {
+ // Not interested in flow within the same device.
continue;
}
- for (const auto& exec : kernel_execs.second) {
+ for (const auto& exec : execs.second) {
int64 start_micros = exec.first;
- auto cprocess = tnodes_.find(kernel_execs.first);
+ auto cprocess = tnodes_.find(execs.first);
if (cprocess == tnodes_.end()) continue;
auto ctn = cprocess->second.find(start_micros);
if (ctn == cprocess->second.end()) continue;
@@ -258,7 +246,6 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
Json::Value args(Json::objectValue);
args["name"] = Json::Value(tnode->name());
- args["op"] = Json::Value(tnode->name());
chrome_formatter_.EmitRegion(node.first, tnode->exec_micros,
process.first, lane.first, "Op",
tnode->name(), args);
@@ -280,12 +267,40 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
for (const auto& dev : mem_tracker_.devices()) {
int64 pid = AllocatePID();
chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
+ int64 pid2 = AllocatePID();
+ chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations",
+ pid2);
+
const MemoryTracker::Device& device = dev.second;
- for (const auto& alloc_stats : device.allocator_stats) {
- chrome_formatter_.EmitCounter("Memory", "Memory Series", pid,
- alloc_stats.first, dev.first,
- alloc_stats.second);
+ int64 max_bytes_in_use = 0;
+ int64 cur_bytes_in_use = 0;
+ int64 last_point = 0;
+ for (const auto& alloc : device.allocations) {
+ cur_bytes_in_use = alloc.second;
+ max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use);
+ // Do not plot too dense to reduce file size.
+ int64 ts = alloc.first;
+ if (ts - last_point < 100) continue;
+ last_point = ts;
+
+ std::map<int64, std::vector<string>> tensor_mem;
+ for (const auto& tensor_alloc_it : dev.second.tensor_allocs) {
+ const auto& tensor_alloc = tensor_alloc_it.second;
+ auto it = tensor_alloc.lower_bound(ts);
+ if (it != tensor_alloc.begin()) {
+ --it;
+ }
+ if (it->second > 0) {
+ tensor_mem[it->second].push_back(tensor_alloc_it.first);
+ }
+ }
+ chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts,
+ dev.first, cur_bytes_in_use, tensor_mem);
+ }
+ if (IsPlacedOnAccelerator(dev.first)) {
+ fprintf(stdout, "%s peak memory: %.2f MB\n", dev.first.c_str(),
+ max_bytes_in_use / 1000000.0);
}
}
OutputTimeline();
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h
index 6c62d1046f..b8174cdecb 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.h
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.h
@@ -28,10 +28,12 @@ namespace tfprof {
typedef std::map<string, string> Event;
+// Class for generating timeline json output.
class ChromeTraceFormatter {
public:
ChromeTraceFormatter() {}
-
+ // The following methods creates timeline nodes. See chrome tracing format
+ // document for details.
Json::Value CreateEvent(const string& ph, const string& category,
const string& name, int64 pid, int64 tid, int64 ts);
@@ -47,22 +49,27 @@ class ChromeTraceFormatter {
int64 flow_id);
void EmitCounter(const string& category, const string& name, int64 pid,
- int64 ts, const string& device, int64 bytes);
+ int64 ts, const string& device, int64 bytes,
+ const std::map<int64, std::vector<string>>& tensor_mem);
string Format();
private:
+ // A event is a visualization unit in timeline.
std::vector<Json::Value> events_;
std::vector<Json::Value> metadata_;
};
+// A process (time series of events) in the timeline.
class Process {
public:
Process(const string& device, int64 pid) : device(device), pid(pid) {}
// Each lane is a map from start_time to end_time.
std::vector<std::map<int64, int64>> lanes;
+ // device for the time series.
string device;
+ // unique id for the time series.
int64 pid;
};
@@ -96,19 +103,16 @@ class MemoryTracker {
public:
class Device {
public:
- // The first 3 fields are predicted.
- std::map<string, int64> tensor_size;
- std::map<string, int64> earliest_ref;
- std::map<string, int64> latest_ref;
+ // map from tensor name to a pair of <alloc time, bytes_in_use>.
+ std::map<string, std::map<int64, int64>> tensor_allocs;
// ground truth memory stats. time->bytes.
- std::map<int64, int64> allocator_stats;
+ std::map<int64, int64> allocations;
+ // tracked allocations, might miss some bytes.
+ std::map<int64, int64> tracked_allocations;
};
void TrackNode(int64 step, const GraphNode* node);
- void TrackNodeConnection(int64 step, const GraphNode* node,
- const GraphNode* src);
-
const std::map<string, Device>& devices() const { return devices_; }
private:
@@ -130,13 +134,9 @@ class Timeline {
void GenerateCodeTimeline(const CodeNode* node);
+ private:
void TrackNode(const GraphNode* node) { mem_tracker_.TrackNode(step_, node); }
- void TrackNodeConnection(GraphNode* node, GraphNode* src) {
- mem_tracker_.TrackNodeConnection(step_, node, src);
- }
-
- private:
void OutputTimeline();
template <typename Node>
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
index babae395ba..91eac0cf76 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
@@ -71,7 +71,7 @@ TEST_F(TFProfTimelineTest, GraphView) {
string dump_str;
TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file + "_0", &dump_str));
- EXPECT_EQ(1754536562981488144ull, Hash64(dump_str));
+ EXPECT_EQ(7932146665024565912ull, Hash64(dump_str));
}
TEST_F(TFProfTimelineTest, ScopeView) {
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 96e0b06bf3..a5e513aa21 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -234,6 +234,7 @@ int Run(int argc, char** argv) {
return 1;
}
tf_stat->AddRunMeta(i, std::move(run_meta));
+ fprintf(stdout, "run graph coverage: %.2f\n", tf_stat->run_coverage());
}
}
diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto
index a1410c7c79..f92301133a 100644
--- a/tensorflow/core/profiler/tfprof_log.proto
+++ b/tensorflow/core/profiler/tfprof_log.proto
@@ -3,6 +3,7 @@ syntax = "proto3";
package tensorflow.tfprof;
import "tensorflow/core/framework/attr_value.proto";
+import "tensorflow/core/framework/step_stats.proto";
// It specifies the Python callstack that creates an op.
message CodeDef {
@@ -89,6 +90,10 @@ message ProfileNode {
map<int64, ExecProfile> execs = 12;
}
+message Allocation {
+ repeated AllocationRecord allocation_records = 1;
+}
+
message ExecProfile {
// Can be larger than 1 if run multiple times in loop.
int64 run_count = 1;
@@ -107,6 +112,8 @@ message ExecProfile {
map<int32, Memory> output_memory = 17;
+ repeated Allocation allocations = 18;
+
repeated string devices = 6;
// Total bytes requested by the op.