diff options
Diffstat (limited to 'tensorflow/core/profiler/internal/tfprof_node.h')
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node.h | 135 |
1 file changed, 107 insertions(+), 28 deletions(-)
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h index 34bc0a581d..e2d0563a07 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.h +++ b/tensorflow/core/profiler/internal/tfprof_node.h @@ -105,8 +105,22 @@ class ExecStep { const { return op_execs_; } + const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs() + const { + return cpu_execs_; + } + int64 all_start_micros() const { return exec_.all_start_micros(); } int64 latest_end_micros() const { return exec_.latest_end_micros(); } + int64 lastest_schedule_end_micros() const { + int64 ret = 0; + for (const auto& exec : cpu_execs_) { + for (const auto& pair : exec.second) { + ret = std::max(ret, pair.first + pair.second); + } + } + return ret; + } int64 requested_bytes() const { return exec_.requested_bytes(); } int64 peak_bytes() const { return exec_.peak_bytes(); } @@ -127,6 +141,8 @@ class ExecStep { return exec_.allocator_bytes_in_use(); } + const std::vector<Allocation>& allocations() const { return allocations_; } + const ExecProfile& ToProto() { exec_.mutable_accelerator_execs()->clear(); for (const auto& e : accelerator_execs_) { @@ -161,6 +177,11 @@ class ExecStep { mem_pb.set_ptr(mem.second.second); } + exec_.mutable_allocations()->Clear(); + for (const auto& r : allocations_) { + exec_.add_allocations()->MergeFrom(r); + } + return exec_; } @@ -175,6 +196,8 @@ class ExecStep { cpu_execs_.clear(); op_execs_.clear(); + allocations_.clear(); + for (const auto& exec_time : exec_.accelerator_execs()) { auto& exec = accelerator_execs_[exec_time.first]; auto& op_exec = op_execs_[exec_time.first]; @@ -196,6 +219,10 @@ class ExecStep { mem.first = output_mem.second.bytes(); mem.second = output_mem.second.ptr(); } + + for (const auto& r : exec_.allocations()) { + allocations_.push_back(r); + } } private: @@ -215,6 +242,9 @@ class ExecStep { std::set<string> devices_; // output_idx -> {output_bytes, memory_ptr} std::map<int32, 
std::pair<int64, uint64>> output_memory_; + + // The history of accelerator allocations and deallocations of this step. + std::vector<Allocation> allocations_; }; #define GRAPH_NODE_BYTES(type) \ @@ -238,11 +268,15 @@ class ExecStep { class TFGraphNode { public: TFGraphNode(const ProfileNode& node, const ProfileProto& profile, - const std::map<int64, string>* id_to_string) { + const std::map<int64, string>* id_to_string, + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) { + nodes_map_ = nodes_map; FromProto(node, profile, id_to_string); } - TFGraphNode(const NodeDef* node, int64 id) { + TFGraphNode(const NodeDef* node, int64 id, + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) { + nodes_map_ = nodes_map; node_.set_id(id); node_.set_name(node->name()); node_.set_op(node->op()); @@ -269,17 +303,9 @@ class TFGraphNode { op_types_.insert(node->op()); } - void AddInput(TFGraphNode* input, int32 output_idx, int input_idx) { - src_output_idx_[input->name()] = output_idx; - - inputs_[input_idx] = input->name(); - const auto& output_shape = input->output_shapes().find(output_idx); - // Always create an empty vec even if the shape info might be missing. - std::vector<int64>& shape_vec = input_shapes_[input_idx]; - if (output_shape != input->output_shapes().end()) { - shape_vec.assign(output_shape->second.begin(), - output_shape->second.end()); - } + void AddInput(const string& input, int64 output_index, int input_idx) { + inputs_[input_idx] = input; + src_output_idx_[input] = output_index; } void AddOpType(const string& op_type) { op_types_.insert(op_type); } @@ -416,9 +442,6 @@ class TFGraphNode { } const std::map<int32, string>& inputs() const { return inputs_; } - const std::map<string, int32>& src_output_idx() const { - return src_output_idx_; - } // Number of times the graph node is executed. When step < 0, the // average number of times executed across all steps. 
@@ -526,14 +549,30 @@ class TFGraphNode { return exec->second.latest_end_micros(); } + int64 lastest_schedule_end_micros(int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.lastest_schedule_end_micros(); + } + const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs( int64 step) const { auto exec = execs_.find(step); if (exec == execs_.end()) { - return empty_op_execs_; + return empty_execs_; } return exec->second.op_execs(); } + const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs( + int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_execs_; + } + return exec->second.cpu_execs(); + } const std::map<int64, ExecStep>& all_op_execs() const { return execs_; } @@ -551,12 +590,12 @@ class TFGraphNode { } return exec->second.host_temp_bytes(); } - int64 accelerator_persistent_bytes(int64 step) const { - auto exec = execs_.find(step); - if (exec == execs_.end()) { - return 0; + int64 accelerator_persistent_bytes() const { + int64 persistent_bytes = 0; + for (const auto& exec : execs_) { + persistent_bytes += exec.second.accelerator_persistent_bytes(); } - return exec->second.accelerator_persistent_bytes(); + return persistent_bytes; } int64 host_persistent_bytes(int64 step) const { auto exec = execs_.find(step); @@ -581,6 +620,14 @@ class TFGraphNode { return exec->second.allocator_bytes_in_use(); } + const std::vector<Allocation>& allocations(int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_allocations_; + } + return exec->second.allocations(); + } + int64 parameters() const { if (!shape().empty()) { int64 params = 1; @@ -628,18 +675,44 @@ class TFGraphNode { const std::map<int, std::vector<int64>>& output_shapes() const { return output_shapes_; } - const std::map<int, std::vector<int64>>& input_shapes() const { - return input_shapes_; + + const std::map<int, 
std::vector<int64>> input_shapes() const { + std::map<int, std::vector<int64>> input_shapes; + for (const auto& inp : inputs_) { + // Always create an empty vec even if the shape info might be missing. + std::vector<int64>& shape_vec = input_shapes[inp.first]; + if (!nodes_map_) continue; + auto input_it = nodes_map_->find(inp.second); + if (input_it == nodes_map_->end()) continue; + auto output_it = src_output_idx_.find(inp.second); + if (output_it == src_output_idx_.end()) continue; + + const TFGraphNode* input_node = input_it->second.get(); + if (!input_node) continue; + const auto& output_shapes = input_node->output_shapes(); + const auto& output_shape = output_shapes.find(output_it->second); + if (output_shape == output_shapes.end()) continue; + + if (output_shape != input_node->output_shapes().end()) { + shape_vec.assign(output_shape->second.begin(), + output_shape->second.end()); + } + } + return input_shapes; } private: + // maps graph node name to TFGraphNode. Not owned. + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map_; + // inputs to the node. input index -> input node name. std::map<int, string> inputs_; + // The output index of the source node. std::map<string, int32> src_output_idx_; - + // proto for serialize/deserialized representation of the node. ProfileNode node_; - + // Python call stack that creates the name. std::unique_ptr<CallStack> call_stack_; - + // Shape of the node (e.g. Variable) if available. std::vector<int64> shape_; // Won't missing input_idx. But some shapes might be empty (unknown). std::map<int, std::vector<int64>> input_shapes_; @@ -651,8 +724,10 @@ class TFGraphNode { std::map<int64, ExecStep> execs_; + // Placeholder for empty cases. 
std::map<int32, std::pair<int64, uint64>> empty_output_memory_; - std::map<string, std::vector<std::pair<int64, int64>>> empty_op_execs_; + std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_; + std::vector<Allocation> empty_allocations_; }; class TFMultiGraphNode { @@ -806,6 +881,10 @@ class TFMultiGraphNode { }; bool IsPlacedOnAccelerator(const string& device); +bool CountAsAcceleratorTime(const string& device); +bool CountAsCPUTime(const string& device); +bool IsCanonicalDevice(const string& device); + } // namespace tfprof } // namespace tensorflow |