author     2017-10-30 20:08:55 -0700
committer  2017-10-30 20:13:00 -0700
commit     113be57466d36ab7086794475cf4579f3e6b940b (patch)
tree       2c77d2a78afe04d79386f5afbfa20a28513f4a68 /tensorflow/core/profiler
parent     6d1263cdf8ee8323513f984553dbeb070865fd0c (diff)
A few profiler improvements
1. Track the full allocation history of each tensor and visualize it in the timeline (see the sketch after this message).
2. Better ProfileContext for tracing step selection; AddStep() now reports the run coverage (see the sketch after the diffstat).
3. Small bug fix.
PiperOrigin-RevId: 173988293
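The first item can be made concrete with a small sketch. The struct and numbers below are hypothetical stand-ins (the real code reads AllocationRecord protos out of each node's ExecStep inside MemoryTracker::TrackNode); only the folding of timestamped byte deltas into a cumulative bytes-in-use curve mirrors the change in tfprof_timeline.cc.

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

// Stand-in for the AllocationRecord proto: a timestamped byte delta,
// positive for an allocation, negative for a deallocation.
struct AllocationRecord {
  int64_t alloc_micros;
  int64_t alloc_bytes;
};

int main() {
  // One node's records, as they would come out of RunMetadata (made up).
  std::vector<AllocationRecord> records = {
      {1000, 4096}, {1200, 8192}, {2000, -4096}, {2600, -8192}};

  // Same folding as the new MemoryTracker::TrackNode: sum the deltas per
  // timestamp, then turn them into a running bytes-in-use total that the
  // timeline can sample at any point in time.
  std::map<int64_t, int64_t> deltas;
  for (const auto& r : records) deltas[r.alloc_micros] += r.alloc_bytes;

  std::map<int64_t, int64_t> bytes_in_use;  // time (us) -> cumulative bytes
  int64_t running = 0;
  for (const auto& d : deltas) {
    running += d.second;
    bytes_in_use[d.first] = running;
  }

  for (const auto& p : bytes_in_use)
    std::printf("t=%lld us  in_use=%lld bytes\n",
                static_cast<long long>(p.first),
                static_cast<long long>(p.second));
  return 0;
}
```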
Diffstat (limited to 'tensorflow/core/profiler')
14 files changed, 273 insertions, 146 deletions
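The step-selection improvement is built on a new coverage metric: TFStats now records which graph nodes have RunMetadata attached (covered_nodes_), and AddStep() returns the covered fraction so a caller such as ProfileContext can decide whether more steps are worth tracing. The sketch below is a simplified stand-in (CoverageTracker is not a real profiler class); only the run_coverage() arithmetic is taken from tfprof_stats.h.

```cpp
#include <cstdio>
#include <map>
#include <set>
#include <string>

// Hypothetical stand-in for the profiler's node bookkeeping.
struct CoverageTracker {
  std::map<std::string, int> nodes_map;  // all graph nodes, name -> id
  std::set<int> covered_nodes;           // ids seen in RunMetadata so far

  // Mirrors TFStats::run_coverage(): covered / total, with a small epsilon
  // so an empty graph does not divide by zero.
  double run_coverage() const {
    return covered_nodes.size() / (nodes_map.size() + 1e-10);
  }
};

int main() {
  CoverageTracker t;
  t.nodes_map = {{"MatMul", 0}, {"BiasAdd", 1}, {"Relu", 2}, {"Softmax", 3}};
  t.covered_nodes = {0, 1, 2};  // only three of four nodes were traced
  // profiler.cc now prints the same figure after each AddRunMeta() call.
  std::printf("run graph coverage: %.2f\n", t.run_coverage());  // 0.75
  return 0;
}
```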
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc index c39d44b7fa..d05143aff9 100644 --- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc +++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc @@ -48,7 +48,7 @@ class TFProfAdvisorTest : public ::testing::Test { for (const auto& attr : attrs) { (*def->mutable_attr())[attr.first].set_s(attr.second); } - std::unique_ptr<TFGraphNode> node(new TFGraphNode(def, -1)); + std::unique_ptr<TFGraphNode> node(new TFGraphNode(def, -1, nullptr)); NodeExecStats node_stat; node_stat.set_all_start_micros(start_miros); diff --git a/tensorflow/core/profiler/internal/print_model_analysis.cc b/tensorflow/core/profiler/internal/print_model_analysis.cc index 575ae182ee..7a0d590262 100644 --- a/tensorflow/core/profiler/internal/print_model_analysis.cc +++ b/tensorflow/core/profiler/internal/print_model_analysis.cc @@ -119,8 +119,8 @@ void DeleteProfiler() { } } -void AddStep(int64 step, const string* graph, const string* run_meta, - const string* op_log) { +double AddStep(int64 step, const string* graph, const string* run_meta, + const string* op_log) { CHECK(tf_stat); CHECK(graph && !graph->empty()); @@ -144,6 +144,7 @@ void AddStep(int64 step, const string* graph, const string* run_meta, op_log_ptr->ParseFromString(*op_log); tf_stat->AddOpLogProto(std::move(op_log_ptr)); } + return tf_stat->run_coverage(); } string Profile(const string* command, const string* options) { @@ -154,6 +155,7 @@ string Profile(const string* command, const string* options) { } void WriteProfile(const string* filename) { + CHECK(tf_stat); CHECK(filename) << "empty file name when asking to write profile."; tf_stat->WriteProfile(*filename); } diff --git a/tensorflow/core/profiler/internal/print_model_analysis.h b/tensorflow/core/profiler/internal/print_model_analysis.h index e4d01041a8..31ff5b07b0 100644 --- a/tensorflow/core/profiler/internal/print_model_analysis.h +++ b/tensorflow/core/profiler/internal/print_model_analysis.h @@ -35,8 +35,8 @@ bool NewProfiler(const string* graph, const string* op_log); void DeleteProfiler(); -void AddStep(int64 step, const string* graph, const string* run_meta, - const string* op_log); +double AddStep(int64 step, const string* graph, const string* run_meta, + const string* op_log); // Write the profiler's profile to a proto buffer. void WriteProfile(const string* filename); diff --git a/tensorflow/core/profiler/internal/tfprof_graph.cc b/tensorflow/core/profiler/internal/tfprof_graph.cc index 3766365bf8..db7ae3b397 100644 --- a/tensorflow/core/profiler/internal/tfprof_graph.cc +++ b/tensorflow/core/profiler/internal/tfprof_graph.cc @@ -31,7 +31,7 @@ GraphNode* TFGraph::CreateParentNode(const string& name) { node_defs_.back()->set_name(name); node_defs_.back()->set_op(kTFGraphParent); parent_nodes_[name] = std::unique_ptr<TFGraphNode>( - new TFGraphNode(node_defs_.back().get(), -1)); + new TFGraphNode(node_defs_.back().get(), -1, nullptr)); nodes_map_[name] = std::unique_ptr<GraphNode>(new GraphNode(parent_nodes_[name].get())); return nodes_map_[name].get(); diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc index f283fafc0f..671b65d708 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.cc +++ b/tensorflow/core/profiler/internal/tfprof_node.cc @@ -19,19 +19,15 @@ limitations under the License. 
namespace tensorflow { namespace tfprof { -namespace { bool CountAsAcceleratorTime(const string& device) { return device.find("stream:all") != device.npos; } - bool CountAsCPUTime(const string& device) { return RE2::FullMatch(device, ".*/(device:gpu|gpu|device:cpu|cpu|device:sycl):\\d+"); } - bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); } -} // namespace // Notes about start and end time from the NodeExecStats proto: // For GPU, there is no difference between op_end_rel_micros and // all_end_rel_micros. All are kernel times. @@ -89,16 +85,28 @@ void ExecStep::AddMemoryStats(const string& dev, } exec_.set_memory_intialized(true); + int accelerator_allocator_cnt = 0; for (const auto& mem : step_stat.memory()) { // TODO(xpan): Fix this hack. Currently the allocator name seems quite // ad-hoc. if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) { continue; } + ++accelerator_allocator_cnt; exec_.set_allocator_bytes_in_use( std::max(static_cast<int64>(exec_.allocator_bytes_in_use()), static_cast<int64>(mem.allocator_bytes_in_use()))); + Allocation allocation; + for (const auto& alloc : mem.allocation_records()) { + allocation.add_allocation_records()->MergeFrom(alloc); + } + allocations_.push_back(allocation); } + if (accelerator_allocator_cnt > 1) { + fprintf(stderr, "found %d gpu allocator for 1 node\n", + accelerator_allocator_cnt); + } + int64 total_output_bytes = 0; for (const auto& output : step_stat.output()) { if (output.has_tensor_description() && diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h index 34bc0a581d..e2d0563a07 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.h +++ b/tensorflow/core/profiler/internal/tfprof_node.h @@ -105,8 +105,22 @@ class ExecStep { const { return op_execs_; } + const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs() + const { + return cpu_execs_; + } + int64 all_start_micros() const { return exec_.all_start_micros(); } int64 latest_end_micros() const { return exec_.latest_end_micros(); } + int64 lastest_schedule_end_micros() const { + int64 ret = 0; + for (const auto& exec : cpu_execs_) { + for (const auto& pair : exec.second) { + ret = std::max(ret, pair.first + pair.second); + } + } + return ret; + } int64 requested_bytes() const { return exec_.requested_bytes(); } int64 peak_bytes() const { return exec_.peak_bytes(); } @@ -127,6 +141,8 @@ class ExecStep { return exec_.allocator_bytes_in_use(); } + const std::vector<Allocation>& allocations() const { return allocations_; } + const ExecProfile& ToProto() { exec_.mutable_accelerator_execs()->clear(); for (const auto& e : accelerator_execs_) { @@ -161,6 +177,11 @@ class ExecStep { mem_pb.set_ptr(mem.second.second); } + exec_.mutable_allocations()->Clear(); + for (const auto& r : allocations_) { + exec_.add_allocations()->MergeFrom(r); + } + return exec_; } @@ -175,6 +196,8 @@ class ExecStep { cpu_execs_.clear(); op_execs_.clear(); + allocations_.clear(); + for (const auto& exec_time : exec_.accelerator_execs()) { auto& exec = accelerator_execs_[exec_time.first]; auto& op_exec = op_execs_[exec_time.first]; @@ -196,6 +219,10 @@ class ExecStep { mem.first = output_mem.second.bytes(); mem.second = output_mem.second.ptr(); } + + for (const auto& r : exec_.allocations()) { + allocations_.push_back(r); + } } private: @@ -215,6 +242,9 @@ class ExecStep { std::set<string> devices_; // output_idx -> {output_bytes, memory_ptr} std::map<int32, std::pair<int64, uint64>> 
output_memory_; + + // The history of accelerator allocations and deallocations of this step. + std::vector<Allocation> allocations_; }; #define GRAPH_NODE_BYTES(type) \ @@ -238,11 +268,15 @@ class ExecStep { class TFGraphNode { public: TFGraphNode(const ProfileNode& node, const ProfileProto& profile, - const std::map<int64, string>* id_to_string) { + const std::map<int64, string>* id_to_string, + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) { + nodes_map_ = nodes_map; FromProto(node, profile, id_to_string); } - TFGraphNode(const NodeDef* node, int64 id) { + TFGraphNode(const NodeDef* node, int64 id, + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) { + nodes_map_ = nodes_map; node_.set_id(id); node_.set_name(node->name()); node_.set_op(node->op()); @@ -269,17 +303,9 @@ class TFGraphNode { op_types_.insert(node->op()); } - void AddInput(TFGraphNode* input, int32 output_idx, int input_idx) { - src_output_idx_[input->name()] = output_idx; - - inputs_[input_idx] = input->name(); - const auto& output_shape = input->output_shapes().find(output_idx); - // Always create an empty vec even if the shape info might be missing. - std::vector<int64>& shape_vec = input_shapes_[input_idx]; - if (output_shape != input->output_shapes().end()) { - shape_vec.assign(output_shape->second.begin(), - output_shape->second.end()); - } + void AddInput(const string& input, int64 output_index, int input_idx) { + inputs_[input_idx] = input; + src_output_idx_[input] = output_index; } void AddOpType(const string& op_type) { op_types_.insert(op_type); } @@ -416,9 +442,6 @@ class TFGraphNode { } const std::map<int32, string>& inputs() const { return inputs_; } - const std::map<string, int32>& src_output_idx() const { - return src_output_idx_; - } // Number of times the graph node is executed. When step < 0, the // average number of times executed across all steps. 
@@ -526,14 +549,30 @@ class TFGraphNode { return exec->second.latest_end_micros(); } + int64 lastest_schedule_end_micros(int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.lastest_schedule_end_micros(); + } + const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs( int64 step) const { auto exec = execs_.find(step); if (exec == execs_.end()) { - return empty_op_execs_; + return empty_execs_; } return exec->second.op_execs(); } + const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs( + int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_execs_; + } + return exec->second.cpu_execs(); + } const std::map<int64, ExecStep>& all_op_execs() const { return execs_; } @@ -551,12 +590,12 @@ class TFGraphNode { } return exec->second.host_temp_bytes(); } - int64 accelerator_persistent_bytes(int64 step) const { - auto exec = execs_.find(step); - if (exec == execs_.end()) { - return 0; + int64 accelerator_persistent_bytes() const { + int64 persistent_bytes = 0; + for (const auto& exec : execs_) { + persistent_bytes += exec.second.accelerator_persistent_bytes(); } - return exec->second.accelerator_persistent_bytes(); + return persistent_bytes; } int64 host_persistent_bytes(int64 step) const { auto exec = execs_.find(step); @@ -581,6 +620,14 @@ class TFGraphNode { return exec->second.allocator_bytes_in_use(); } + const std::vector<Allocation>& allocations(int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_allocations_; + } + return exec->second.allocations(); + } + int64 parameters() const { if (!shape().empty()) { int64 params = 1; @@ -628,18 +675,44 @@ class TFGraphNode { const std::map<int, std::vector<int64>>& output_shapes() const { return output_shapes_; } - const std::map<int, std::vector<int64>>& input_shapes() const { - return input_shapes_; + + const std::map<int, std::vector<int64>> input_shapes() const { + std::map<int, std::vector<int64>> input_shapes; + for (const auto& inp : inputs_) { + // Always create an empty vec even if the shape info might be missing. + std::vector<int64>& shape_vec = input_shapes[inp.first]; + if (!nodes_map_) continue; + auto input_it = nodes_map_->find(inp.second); + if (input_it == nodes_map_->end()) continue; + auto output_it = src_output_idx_.find(inp.second); + if (output_it == src_output_idx_.end()) continue; + + const TFGraphNode* input_node = input_it->second.get(); + if (!input_node) continue; + const auto& output_shapes = input_node->output_shapes(); + const auto& output_shape = output_shapes.find(output_it->second); + if (output_shape == output_shapes.end()) continue; + + if (output_shape != input_node->output_shapes().end()) { + shape_vec.assign(output_shape->second.begin(), + output_shape->second.end()); + } + } + return input_shapes; } private: + // maps graph node name to TFGraphNode. Not owned. + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map_; + // inputs to the node. input index -> input node name. std::map<int, string> inputs_; + // The output index of the source node. std::map<string, int32> src_output_idx_; - + // proto for serialize/deserialized representation of the node. ProfileNode node_; - + // Python call stack that creates the name. std::unique_ptr<CallStack> call_stack_; - + // Shape of the node (e.g. Variable) if available. std::vector<int64> shape_; // Won't missing input_idx. But some shapes might be empty (unknown). 
std::map<int, std::vector<int64>> input_shapes_; @@ -651,8 +724,10 @@ class TFGraphNode { std::map<int64, ExecStep> execs_; + // Placeholder for empty cases. std::map<int32, std::pair<int64, uint64>> empty_output_memory_; - std::map<string, std::vector<std::pair<int64, int64>>> empty_op_execs_; + std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_; + std::vector<Allocation> empty_allocations_; }; class TFMultiGraphNode { @@ -806,6 +881,10 @@ class TFMultiGraphNode { }; bool IsPlacedOnAccelerator(const string& device); +bool CountAsAcceleratorTime(const string& device); +bool CountAsCPUTime(const string& device); +bool IsCanonicalDevice(const string& device); + } // namespace tfprof } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/tfprof_scope.cc b/tensorflow/core/profiler/internal/tfprof_scope.cc index 128b296d5c..988bed71cc 100644 --- a/tensorflow/core/profiler/internal/tfprof_scope.cc +++ b/tensorflow/core/profiler/internal/tfprof_scope.cc @@ -35,7 +35,7 @@ ScopeNode* TFScope::CreateParentNode(const string& name) { node_defs_.back()->set_name(name); node_defs_.back()->set_op(kTFScopeParent); parent_nodes_[name] = std::unique_ptr<TFGraphNode>( - new TFGraphNode(node_defs_.back().get(), -1)); + new TFGraphNode(node_defs_.back().get(), -1, nullptr)); nodes_map_[name] = std::unique_ptr<ScopeNode>(new ScopeNode(parent_nodes_[name].get())); return nodes_map_[name].get(); diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc index b4b98141f3..7943c075e0 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats.cc +++ b/tensorflow/core/profiler/internal/tfprof_stats.cc @@ -36,7 +36,9 @@ bool CreateRunMetadataNode(const string& name, NodeDef* def) { } def->set_name(name); // TODO(xpan): Better operation type. - def->set_op("RunTimeOp"); + // This is because some times a node doesn't have a op type, + // so we use node name as the op type. + def->set_op(name); return true; } } // namespace @@ -86,7 +88,7 @@ TFStats::TFStats(const string& filename, } for (const auto& node_pb : profile.nodes()) { std::unique_ptr<TFGraphNode> node( - new TFGraphNode(node_pb.second, profile, &id_to_string_)); + new TFGraphNode(node_pb.second, profile, &id_to_string_, &nodes_map_)); nodes_map_.insert(std::pair<string, std::unique_ptr<TFGraphNode>>( node_pb.second.name(), std::move(node))); } @@ -178,12 +180,14 @@ const MultiGraphNodeProto& TFStats::ShowMultiGraphNode( void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) { std::map<string, const NodeDef*> node_defs; + bool node_added = false; for (const NodeDef& node : graph->node()) { if (nodes_map_.find(node.name()) != nodes_map_.end()) { continue; } - nodes_map_[node.name()] = - std::unique_ptr<TFGraphNode>(new TFGraphNode(&node, nodes_map_.size())); + node_added = true; + nodes_map_[node.name()] = std::unique_ptr<TFGraphNode>( + new TFGraphNode(&node, nodes_map_.size(), &nodes_map_)); node_defs[node.name()] = &node; } for (auto it = node_defs.begin(); it != node_defs.end(); it++) { @@ -192,6 +196,7 @@ void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) { string node_input = it->second->input(i); int output_idx = 0; // input name format can be: "^node:src_output" + // if not :src_output, then it's the first one (further verify?) 
auto prefix_pos = node_input.find(":"); if (prefix_pos != node_input.npos) { std::vector<string> input_parts = str_util::Split(node_input, ":"); @@ -204,15 +209,18 @@ void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) { if (node_input.substr(0, 1) == "^") { node_input = node_input.substr(1); } - auto input_node = nodes_map_.find(node_input); - // TODO(xpan): P1: Add the input even if it doesn't exist yet, because - // this can be a partial graph. - if (input_node == nodes_map_.end()) { - continue; - } - node->AddInput(input_node->second.get(), output_idx, i); + // Delay input TFGraphNode retrieval as late as possible. + // In long run, when we have TensorFlow runtime graph, the + // graph connection should be dynamic and per-step. + node->AddInput(node_input, output_idx, i); } } + if (node_added) { + graph_view_.reset(nullptr); + scope_view_.reset(nullptr); + op_view_.reset(nullptr); + code_view_.reset(nullptr); + } } void TFStats::AddOpLogProto(std::unique_ptr<OpLogProto> op_log) { @@ -263,10 +271,11 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) { NodeDef def; if (CreateRunMetadataNode(name, &def)) { nodes_map_[name] = std::unique_ptr<TFGraphNode>( - new TFGraphNode(&def, nodes_map_.size())); + new TFGraphNode(&def, nodes_map_.size(), &nodes_map_)); nodes_map_.at(name)->AddStepStat(step, dev_stat.device(), node_stat); } } else { + covered_nodes_.insert(node->second->id()); node->second->AddStepStat(step, dev_stat.device(), node_stat); } } diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h index bb4baea738..d46d923556 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats.h +++ b/tensorflow/core/profiler/internal/tfprof_stats.h @@ -66,6 +66,9 @@ class TFStats { } const std::set<int64>& steps() const { return steps_; } bool has_code_traces() const { return has_code_traces_; } + double run_coverage() const { + return covered_nodes_.size() / (nodes_map_.size() + 1e-10); + } void BuildView(const string& cmd); void BuildAllViews(); @@ -104,13 +107,16 @@ class TFStats { std::unique_ptr<TFCode> code_view_; std::unique_ptr<TFOp> op_view_; std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader_; - // Store TFGraphNode instead of TFGraphNode* to avoid large number of - // dynamic alloc. + // TODO(xpan): Store TFGraphNode instead of TFGraphNode* to avoid large + // number of dynamic alloc. + // Maps from graph node name to TFGraphNode. std::map<string, std::unique_ptr<TFGraphNode>> nodes_map_; GraphNodeProto empty_graph_node_; MultiGraphNodeProto empty_multi_graph_node_; std::map<int64, string> id_to_string_; + // Graph nodes covered by RunMetdata, that is traced with run time stats. + std::set<int64> covered_nodes_; }; } // namespace tfprof diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc index 1732574cc4..bdb000747d 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline.cc +++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc @@ -25,6 +25,8 @@ limitations under the License. 
namespace tensorflow { namespace tfprof { namespace { +int kMaxDisplayedMemNode = 10; + string GetTimeDevName(const string& dev) { if (dev.find("stream") != dev.npos) { return strings::StrCat("Op execution threads: ", dev); @@ -85,14 +87,41 @@ void ChromeTraceFormatter::EmitFlowEnd(const string& name, int64 ts, int64 pid, events_.push_back(event); } -void ChromeTraceFormatter::EmitCounter(const string& category, - const string& name, int64 pid, int64 ts, - const string& device, int64 bytes) { - Json::Value event = CreateEvent("C", category, name, pid, 0, ts); +void ChromeTraceFormatter::EmitCounter( + const string& category, const string& name, int64 pid, int64 ts, + const string& device, int64 bytes, + const std::map<int64, std::vector<string>>& tensor_mem) { + Json::Value event = CreateEvent("C", category, "Allocated Bytes", pid, 0, ts); Json::Value args(Json::objectValue); - args[device] = Json::Value(bytes); + args["Allocator Bytes in Use"] = Json::Value(bytes); event["args"] = args; events_.push_back(event); + + // TODO(xpan): chrome://tracing is not ideal visualization for memory. + // It would be great to have a customized UI for it. + Json::Value event2 = + CreateEvent("C", category, "Top Allocations", pid + 1, 0, ts); + Json::Value args2(Json::objectValue); + // Need to reserve the same args for all locations. + for (int i = 1; i < kMaxDisplayedMemNode; ++i) { + args2[strings::Printf("Top Allocation %02d", i)] = Json::Value("N/A"); + } + int count = 0; + for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) { + for (const string& t : it->second) { + if (bytes < it->first || count >= kMaxDisplayedMemNode) { + break; + } + args2[strings::Printf("Top Allocation %02d", count)] = + Json::Value(strings::StrCat(it->first / 1000000.0, " MB from ", t)); + ++count; + bytes -= it->first; + } + } + args2[strings::StrCat("Not Displayed")] = + Json::Value(strings::Printf("%.2f MB", bytes / 1000000.0)); + event2["args"] = args2; + events_.push_back(event2); } string ChromeTraceFormatter::Format() { @@ -119,71 +148,28 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) { if (!node->Trackable(step)) { return; } + Device& dev = devices_[node->node->canonical_device()]; - int64 end_micros = node->node->latest_end_micros(step); - if (node->node->accelerator_persistent_bytes(step) != 0) { - string tensor_name = strings::StrCat(node->name(), ":", -1); - dev.earliest_ref[tensor_name] = node->node->all_start_micros(step); - dev.tensor_size[tensor_name] = - node->node->accelerator_persistent_bytes(step); - // TODO(xpan): Need latest_ref? 
- } - if (node->node->accelerator_temp_bytes(step)) { - string tensor_name = strings::StrCat(node->name(), ":", -2); - dev.earliest_ref[tensor_name] = node->node->all_start_micros(step); - dev.latest_ref[tensor_name] = end_micros; - dev.tensor_size[tensor_name] = node->node->accelerator_temp_bytes(step); - } - if (node->node->allocator_bytes_in_use(step) > 0) { - dev.allocator_stats[end_micros] = node->node->allocator_bytes_in_use(step); - } -} -void MemoryTracker::TrackNodeConnection(int64 step, const GraphNode* node, - const GraphNode* src) { - if (!node->Trackable(step) || !src->Trackable(step)) { - return; - } - const auto& output_idx = node->node->src_output_idx().find(src->name()); - if (output_idx == node->node->src_output_idx().end()) { - return; - } - const auto& output = src->node->output_memory(step).find(output_idx->second); - if (output == src->node->output_memory(step).end()) { - return; + std::map<int64, int64> allocs; + for (const auto& alloc : node->node->allocations(step)) { + for (const auto& r : alloc.allocation_records()) { + allocs[r.alloc_micros()] += r.alloc_bytes(); + dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes(); + } } - int64 output_bytes = output->second.first; - uint64 output_ptr = output->second.second; - - Device& src_dev = devices_[src->node->canonical_device()]; - string tensor_name = strings::StrCat(output_ptr); - if (output_ptr == 0) { - fprintf(stderr, "output no ptr\n"); - tensor_name = strings::StrCat(src->node->name(), ":", output_idx->second); + dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes(); + allocs[0] += node->node->accelerator_persistent_bytes(); + + int64 last = 0; + std::map<int64, int64>& aggregate_allocs = dev.tensor_allocs[node->name()]; + for (auto it = allocs.begin(); it != allocs.end(); ++it) { + last += it->second; + aggregate_allocs[it->first] = last; } - - src_dev.tensor_size[tensor_name] = output_bytes; - src_dev.earliest_ref[tensor_name] = src->node->all_start_micros(step); - - int64 src_end_micros = src->node->latest_end_micros(step); - - if (src->node->canonical_device() != node->node->canonical_device()) { - int64 transfer_micros = - (src_end_micros + node->node->all_start_micros(step)) / 2; - src_dev.latest_ref[tensor_name] = - std::max(src_dev.latest_ref[tensor_name], transfer_micros); - - Device& dest_dev = devices_[node->node->canonical_device()]; - string dest_tensor_name = - strings::StrCat(tensor_name, node->node->canonical_device()); - dest_dev.tensor_size[dest_tensor_name] = output_bytes; - dest_dev.earliest_ref[dest_tensor_name] = transfer_micros; - dest_dev.latest_ref[dest_tensor_name] = - std::max(dest_dev.latest_ref[dest_tensor_name], - node->node->latest_end_micros(step)); - } else { - src_dev.latest_ref[tensor_name] = std::max( - src_dev.latest_ref[tensor_name], node->node->latest_end_micros(step)); + int64 end_micros = node->node->lastest_schedule_end_micros(step); + if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) { + dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step); } } @@ -222,22 +208,24 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { for (GraphNode* gnode : gnodes) { AllocateTimeNodes(gnode); } + // To save memory, we only track cross-device (canonical device) flows. 
for (auto& process : tnodes_) { + if (!IsCanonicalDevice(process.first)) continue; for (auto& tn : process.second) { TimeNode* tnode = tn.second.get(); for (GraphNode* inp : tnode->node->children) { if (!inp->account || !inp->Trackable(step_)) { continue; } - TrackNodeConnection(tnode->node, inp); - for (const auto& kernel_execs : inp->node->op_execs(step_)) { - if (process.first == kernel_execs.first) { - // Not interested in flow withthin the same device. + for (const auto& execs : inp->node->cpu_execs(step_)) { + if (!IsCanonicalDevice(execs.first)) continue; + if (process.first == execs.first) { + // Not interested in flow within the same device. continue; } - for (const auto& exec : kernel_execs.second) { + for (const auto& exec : execs.second) { int64 start_micros = exec.first; - auto cprocess = tnodes_.find(kernel_execs.first); + auto cprocess = tnodes_.find(execs.first); if (cprocess == tnodes_.end()) continue; auto ctn = cprocess->second.find(start_micros); if (ctn == cprocess->second.end()) continue; @@ -258,7 +246,6 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { Json::Value args(Json::objectValue); args["name"] = Json::Value(tnode->name()); - args["op"] = Json::Value(tnode->name()); chrome_formatter_.EmitRegion(node.first, tnode->exec_micros, process.first, lane.first, "Op", tnode->name(), args); @@ -280,12 +267,40 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) { for (const auto& dev : mem_tracker_.devices()) { int64 pid = AllocatePID(); chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid); + int64 pid2 = AllocatePID(); + chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first) + " allocations", + pid2); + const MemoryTracker::Device& device = dev.second; - for (const auto& alloc_stats : device.allocator_stats) { - chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, - alloc_stats.first, dev.first, - alloc_stats.second); + int64 max_bytes_in_use = 0; + int64 cur_bytes_in_use = 0; + int64 last_point = 0; + for (const auto& alloc : device.allocations) { + cur_bytes_in_use = alloc.second; + max_bytes_in_use = std::max(max_bytes_in_use, cur_bytes_in_use); + // Do not plot too dense to reduce file size. + int64 ts = alloc.first; + if (ts - last_point < 100) continue; + last_point = ts; + + std::map<int64, std::vector<string>> tensor_mem; + for (const auto& tensor_alloc_it : dev.second.tensor_allocs) { + const auto& tensor_alloc = tensor_alloc_it.second; + auto it = tensor_alloc.lower_bound(ts); + if (it != tensor_alloc.begin()) { + --it; + } + if (it->second > 0) { + tensor_mem[it->second].push_back(tensor_alloc_it.first); + } + } + chrome_formatter_.EmitCounter("Memory", "Memory Series", pid, ts, + dev.first, cur_bytes_in_use, tensor_mem); + } + if (IsPlacedOnAccelerator(dev.first)) { + fprintf(stdout, "%s peak memory: %.2f MB\n", dev.first.c_str(), + max_bytes_in_use / 1000000.0); } } OutputTimeline(); diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.h b/tensorflow/core/profiler/internal/tfprof_timeline.h index 6c62d1046f..b8174cdecb 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline.h +++ b/tensorflow/core/profiler/internal/tfprof_timeline.h @@ -28,10 +28,12 @@ namespace tfprof { typedef std::map<string, string> Event; +// Class for generating timeline json output. class ChromeTraceFormatter { public: ChromeTraceFormatter() {} - + // The following methods creates timeline nodes. See chrome tracing format + // document for details. 
Json::Value CreateEvent(const string& ph, const string& category, const string& name, int64 pid, int64 tid, int64 ts); @@ -47,22 +49,27 @@ class ChromeTraceFormatter { int64 flow_id); void EmitCounter(const string& category, const string& name, int64 pid, - int64 ts, const string& device, int64 bytes); + int64 ts, const string& device, int64 bytes, + const std::map<int64, std::vector<string>>& tensor_mem); string Format(); private: + // A event is a visualization unit in timeline. std::vector<Json::Value> events_; std::vector<Json::Value> metadata_; }; +// A process (time series of events) in the timeline. class Process { public: Process(const string& device, int64 pid) : device(device), pid(pid) {} // Each lane is a map from start_time to end_time. std::vector<std::map<int64, int64>> lanes; + // device for the time series. string device; + // unique id for the time series. int64 pid; }; @@ -96,19 +103,16 @@ class MemoryTracker { public: class Device { public: - // The first 3 fields are predicted. - std::map<string, int64> tensor_size; - std::map<string, int64> earliest_ref; - std::map<string, int64> latest_ref; + // map from tensor name to a pair of <alloc time, bytes_in_use>. + std::map<string, std::map<int64, int64>> tensor_allocs; // ground truth memory stats. time->bytes. - std::map<int64, int64> allocator_stats; + std::map<int64, int64> allocations; + // tracked allocations, might miss some bytes. + std::map<int64, int64> tracked_allocations; }; void TrackNode(int64 step, const GraphNode* node); - void TrackNodeConnection(int64 step, const GraphNode* node, - const GraphNode* src); - const std::map<string, Device>& devices() const { return devices_; } private: @@ -130,13 +134,9 @@ class Timeline { void GenerateCodeTimeline(const CodeNode* node); + private: void TrackNode(const GraphNode* node) { mem_tracker_.TrackNode(step_, node); } - void TrackNodeConnection(GraphNode* node, GraphNode* src) { - mem_tracker_.TrackNodeConnection(step_, node, src); - } - - private: void OutputTimeline(); template <typename Node> diff --git a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc index babae395ba..91eac0cf76 100644 --- a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc +++ b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc @@ -71,7 +71,7 @@ TEST_F(TFProfTimelineTest, GraphView) { string dump_str; TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file + "_0", &dump_str)); - EXPECT_EQ(1754536562981488144ull, Hash64(dump_str)); + EXPECT_EQ(7932146665024565912ull, Hash64(dump_str)); } TEST_F(TFProfTimelineTest, ScopeView) { diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc index 96e0b06bf3..a5e513aa21 100644 --- a/tensorflow/core/profiler/profiler.cc +++ b/tensorflow/core/profiler/profiler.cc @@ -234,6 +234,7 @@ int Run(int argc, char** argv) { return 1; } tf_stat->AddRunMeta(i, std::move(run_meta)); + fprintf(stdout, "run graph coverage: %.2f\n", tf_stat->run_coverage()); } } diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto index a1410c7c79..f92301133a 100644 --- a/tensorflow/core/profiler/tfprof_log.proto +++ b/tensorflow/core/profiler/tfprof_log.proto @@ -3,6 +3,7 @@ syntax = "proto3"; package tensorflow.tfprof; import "tensorflow/core/framework/attr_value.proto"; +import "tensorflow/core/framework/step_stats.proto"; // It specifies the Python callstack that creates an op. 
message CodeDef { @@ -89,6 +90,10 @@ message ProfileNode { map<int64, ExecProfile> execs = 12; } +message Allocation { + repeated AllocationRecord allocation_records = 1; +} + message ExecProfile { // Can be larger than 1 if run multiple times in loop. int64 run_count = 1; @@ -107,6 +112,8 @@ message ExecProfile { map<int32, Memory> output_memory = 17; + repeated Allocation allocations = 18; + repeated string devices = 6; // Total bytes requested by the op. |
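On the visualization side, the reworked ChromeTraceFormatter::EmitCounter emits a second counter series that names the largest live tensors at each sampled timestamp. The sketch below uses made-up tensor names and plain printf output instead of the JSON counter events the real code builds, but it follows the same top-K selection loop, including the kMaxDisplayedMemNode cap and the "Not Displayed" remainder.

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  const int kMaxDisplayedMemNode = 10;
  int64_t bytes = 26 * 1000000;  // allocator bytes in use at this timestamp
  // Bytes -> names of live tensors of that size (mirrors the tensor_mem
  // argument assembled in Timeline::GenerateGraphTimeline).
  std::map<int64_t, std::vector<std::string>> tensor_mem = {
      {16 * 1000000, {"conv1/weights"}},
      {8 * 1000000, {"conv1/activations"}},
      {1 * 1000000, {"bias", "scratch"}}};

  // Walk from the largest size down, charging each displayed tensor against
  // the bytes-in-use figure so the leftover can be reported at the end.
  int count = 0;
  for (auto it = tensor_mem.rbegin(); it != tensor_mem.rend(); ++it) {
    for (const std::string& t : it->second) {
      if (bytes < it->first || count >= kMaxDisplayedMemNode) break;
      std::printf("Top Allocation %02d: %.2f MB from %s\n", count,
                  it->first / 1000000.0, t.c_str());
      ++count;
      bytes -= it->first;
    }
  }
  std::printf("Not Displayed: %.2f MB\n", bytes / 1000000.0);
  return 0;
}
```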