Unify all profile files (graph,run_meta,op_log) into one.

Also allow profiler to serialize/deserialize to/from file. PiperOrigin-RevId: 167815923
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-09-06 21:56:33 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-09-06 22:01:54 -0700
commit: 9e9ffa33d154b6c332bd475d6da2030746460fed (patch)
tree: 64de1c00853926d394f6f4ba82f01f448e990fda /tensorflow/core/profiler
parent: 27ce2a4ac956941ba8a0b9aaaa77acc0aa861fef (diff)
16 files changed, 631 insertions, 242 deletions
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index f0d4dafd3e..2ddac0a79b 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -21,60 +21,70 @@
 ### Quick Start
 
 ```python
-# Enable tracing and profile the time and memory information.
+# When using high-level API, session is usually hidden.
+#
+# Under the default ProfileContext, run a few hundred steps.
+# The ProfileContext will sample some steps and dump the profiles
+# to files. Users can then use command line tool or Web UI for
+# interactive profiling.
+with tf.contrib.tfprof.ProfileContext('/tmp/train_dir') as pctx:
+  # High level API, such as slim, Estimator, etc.
+  train_loop()
+
+bazel-bin/tensorflow/core/profiler/profiler \
+    --profile_path=/tmp/train_dir/profile_xx
+tfprof> op -select micros,bytes,occurrence -order_by micros
+
+# To be open sourced...
+bazel-bin/third_party/tensorflow/python/profiler/profiler_ui \
+    --profile_path=/tmp/profiles/profile_1
+```
+![ProfilerUI](g3doc/profiler_ui.jpg)
+
+```python
+# When using lower-level APIs with a Session object. User can have
+# explicit control of each step.
+#
+# Create options to profile the time and memory information.
 builder = tf.profiler.ProfileOptionBuilder
 opts = builder(builder.time_and_memory()).order_by('micros').build()
-with tf.contrib.tfprof.ProfileContext() as pctx:
+# Create a profiling context, set contructor argument `trace_steps`, 
+# `dump_steps` to empty for explicit control.
+with tf.contrib.tfprof.ProfileContext('/tmp/train_dir',
+                                      trace_steps=[],
+                                      dump_steps=[]) as pctx:
   with tf.Session() as sess:
-    x = build_model()
-    sess.run(tf.global_variables_initializer())
-    for i in range(10000):
-      if i == 10:
-        pctx.capture_next_run_meta()
-        _ = sess.run(x)
-        tf.profiler.profile(sess.graph, run_meta=pctx.run_meta(), cmd='op', options=opts)
-      else:
-        _ = sess.run(x)
+    # Enable tracing for next session.run.
+    pctx.trace_next_step()
+    # Dump the profile to '/tmp/train_dir' after the step.
+    pctx.dump_next_step()
+    _ = session.run(train_op)
+    pctx.profiler.profile_operations(options=opts)
 ```
 
 ```python
-# When using high-level API, session is usually hidden.
-# Register profiling to ProfileContext that performs profiling automatically.
+# For more advanced usage, user can control the tracing steps and
+# dumping steps. User can also run online profiling during training.
+#
+# Create options to profile time/memory as well as parameters.
 builder = tf.profiler.ProfileOptionBuilder
 opts = builder(builder.time_and_memory()).order_by('micros').build()
 opts2 = tf.profiler.ProfileOptionBuilder.trainable_variables_parameter()
-with tf.contrib.tfprof.ProfileContext() as pctx:
-  # Profile time and memory from step 100 to 109.
-  # Note: High level APIs can run sessions for queue, variable initialization
-  # or training at different steps. Try profile multiple steps.
-  pctx.add_auto_profiling('op', opts, range(100, 110))
-  # Profile model parameters once at step 10 (since it is static).
-  pctx.add_auto_profiling('scope', opts2, [10])
+
+# Collect traces of steps 10~20, dump the whole profile (with traces of
+# step 10~20) at step 20. The dumped profile can be used for further profiling
+# with command line interface or Web UI.
+with tf.contrib.tfprof.ProfileContext('/tmp/train_dir',
+                                      trace_steps=range(10, 20),
+                                      dump_steps=[20]) as pctx:
+  # Run online profiling with 'op' view and 'opts' options at step 15, 18, 20.
+  pctx.add_auto_profiling('op', opts, [15, 18, 20])
+  # Run online profiling with 'scope' view and 'opts2' options at step 20.
+  pctx.add_auto_profiling('scope', opts2, [20])
   # High level API, such as slim, Estimator, etc.
   train_loop()
 ```
 
-```shell
-# Profiling from Python API is not interactive.
-# Dump the profiles to files and profile with interactive command line or web UI.
-with tf.contrib.tfprof.ProfileContext() as pctx:
-  pctx.add_auto_profile_dump('/tmp/profiles', [100])
-  train_loop()
-
-bazel-bin/tensorflow/core/profiler/profiler \
-    --graph_path=/tmp/profiles/graph.pbtxt \
-    --run_meta_path=/tmp/profiles/run_meta \
-    --op_log_path=/tmp/profiles/tfprof_log \
-tfprof> op -select micros,bytes,occurrence -order_by micros
-
-
-# To be open sourced...
-bazel-bin/third_party/tensorflow/python/profiler/profiler_ui \
-    --graph_path=/tmp/profiles/graph.pbtxt \
-    --run_meta_path=/tmp/profiles/run_meta \
-    --op_log_path=/tmp/profiles/tfprof_log \
-```
-![ProfilerUI](g3doc/profiler_ui.jpg)
 
 <b>Detail Tutorials</b>
 
diff --git a/tensorflow/core/profiler/internal/advisor/operation_checker.h b/tensorflow/core/profiler/internal/advisor/operation_checker.h
index 08f6d3c389..f0bd72fa40 100644
--- a/tensorflow/core/profiler/internal/advisor/operation_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/operation_checker.h
@@ -45,9 +45,10 @@ class OperationChecker : public Checker {
       if (node->op_types().find("FusedBatchNorm") != node->op_types().end()) {
         use_fused_batch_norm = true;
       }
-      if (node->op_attrs().find("data_format") != node->op_attrs().end()) {
-        const AttrValue* attr_val = node->op_attrs().at("data_format");
-        if (attr_val->s() == "NHWC" &&
+
+      const AttrValue* attr = node->op_attrs("data_format");
+      if (attr) {
+        if (attr->s() == "NHWC" &&
             IsPlacedOnAccelerator(node->canonical_device())) {
           recommend_nchw = true;
         }
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index 23ed287f7b..c39d44b7fa 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -48,7 +48,7 @@ class TFProfAdvisorTest : public ::testing::Test {
     for (const auto& attr : attrs) {
       (*def->mutable_attr())[attr.first].set_s(attr.second);
     }
-    std::unique_ptr<TFGraphNode> node(new TFGraphNode(def));
+    std::unique_ptr<TFGraphNode> node(new TFGraphNode(def, -1));
 
     NodeExecStats node_stat;
     node_stat.set_all_start_micros(start_miros);
diff --git a/tensorflow/core/profiler/internal/print_model_analysis.cc b/tensorflow/core/profiler/internal/print_model_analysis.cc
index fd46b957e8..ddf3c7f1f2 100644
--- a/tensorflow/core/profiler/internal/print_model_analysis.cc
+++ b/tensorflow/core/profiler/internal/print_model_analysis.cc
@@ -90,38 +90,61 @@ bool NewProfiler(const string* graph, const string* op_log) {
   if (!graph_ptr->ParseFromString(*graph)) {
     if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) {
       fprintf(stderr, "Failed to parse graph\n");
+      return false;
     }
   }
 
   std::unique_ptr<OpLogProto> op_log_ptr;
   if (op_log && !op_log->empty()) {
     op_log_ptr.reset(new OpLogProto());
-    op_log_ptr->ParseFromString(*op_log);
+    if (!op_log_ptr->ParseFromString(*op_log)) {
+      fprintf(stderr, "Failed to parse OpLogProto.\n");
+      return false;
+    }
   }
   tf_stat = new TFStats(std::move(graph_ptr), nullptr, std::move(op_log_ptr),
                         nullptr);
   return true;
 }
 
+void ProfilerFromFile(const string* filename) {
+  CHECK(!tf_stat) << "Currently only 1 living tfprof profiler is allowed";
+  CHECK(filename) << "Missing profile filename to init profiler from file";
+  tf_stat = new TFStats(*filename, nullptr);
+}
+
 void DeleteProfiler() {
-  delete tf_stat;
-  tf_stat = nullptr;
+  if (tf_stat) {
+    delete tf_stat;
+    tf_stat = nullptr;
+  }
 }
 
-void AddStep(int64 step, const string* run_meta, const string* op_log) {
+void AddStep(int64 step, const string* graph, const string* run_meta,
+             const string* op_log) {
   CHECK(tf_stat);
+
+  CHECK(graph && !graph->empty());
+  std::unique_ptr<GraphDef> graph_ptr(new GraphDef());
+  if (!graph_ptr->ParseFromString(*graph)) {
+    if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) {
+      fprintf(stderr, "Failed to parse graph\n");
+    }
+  }
+  tf_stat->AddGraph(std::move(graph_ptr));
+
   CHECK(run_meta && !run_meta->empty());
   // TODO(xpan): Better error handling.
   std::unique_ptr<RunMetadata> run_meta_ptr(new RunMetadata());
   run_meta_ptr->ParseFromString(*run_meta);
   tf_stat->AddRunMeta(step, std::move(run_meta_ptr));
 
-  std::unique_ptr<OpLogProto> op_log_ptr;
   if (op_log && !op_log->empty()) {
+    std::unique_ptr<OpLogProto> op_log_ptr;
     op_log_ptr.reset(new OpLogProto());
     op_log_ptr->ParseFromString(*op_log);
+    tf_stat->AddOpLogProto(std::move(op_log_ptr));
   }
-  tf_stat->AddOpLogProto(std::move(op_log_ptr));
 }
 
 string Profile(const string* command, const string* options) {
@@ -131,6 +154,11 @@ string Profile(const string* command, const string* options) {
   return RunProfile(*command, *options, tf_stat);
 }
 
+void WriteProfile(const string* filename) {
+  CHECK(filename) << "empty file name when asking to write profile.";
+  tf_stat->WriteProfile(*filename);
+}
+
 string PrintModelAnalysis(const string* graph, const string* run_meta,
                           const string* op_log, const string* command,
                           const string* options) {
diff --git a/tensorflow/core/profiler/internal/print_model_analysis.h b/tensorflow/core/profiler/internal/print_model_analysis.h
index 76a436f71c..e4d01041a8 100644
--- a/tensorflow/core/profiler/internal/print_model_analysis.h
+++ b/tensorflow/core/profiler/internal/print_model_analysis.h
@@ -35,7 +35,14 @@ bool NewProfiler(const string* graph, const string* op_log);
 
 void DeleteProfiler();
 
-void AddStep(int64 step, const string* run_meta, const string* op_log);
+void AddStep(int64 step, const string* graph, const string* run_meta,
+             const string* op_log);
+
+// Write the profiler's profile to a proto buffer.
+void WriteProfile(const string* filename);
+
+// Load the profile to profiler from a proto buffer file.
+void ProfilerFromFile(const string* filename);
 
 string Profile(const string* command, const string* options);
 
diff --git a/tensorflow/core/profiler/internal/tfprof_graph.cc b/tensorflow/core/profiler/internal/tfprof_graph.cc
index e58924a614..3766365bf8 100644
--- a/tensorflow/core/profiler/internal/tfprof_graph.cc
+++ b/tensorflow/core/profiler/internal/tfprof_graph.cc
@@ -30,8 +30,8 @@ GraphNode* TFGraph::CreateParentNode(const string& name) {
   node_defs_.push_back(std::unique_ptr<NodeDef>(new NodeDef()));
   node_defs_.back()->set_name(name);
   node_defs_.back()->set_op(kTFGraphParent);
-  parent_nodes_[name] =
-      std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get()));
+  parent_nodes_[name] = std::unique_ptr<TFGraphNode>(
+      new TFGraphNode(node_defs_.back().get(), -1));
   nodes_map_[name] =
       std::unique_ptr<GraphNode>(new GraphNode(parent_nodes_[name].get()));
   return nodes_map_[name].get();
@@ -49,11 +49,11 @@ void TFGraph::Build() {
   // Filter out the root nodes (node not input of any other node).
   for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
     GraphNode* node = it->second.get();
-    const std::map<int, TFGraphNode*>& inputs = node->node->inputs();
+    const std::map<int, string>& inputs = node->node->inputs();
     for (auto inputs_it = inputs.cbegin(); inputs_it != inputs.cend();
          inputs_it++) {
-      nonroots.insert(inputs_it->second->name());
-      auto child_it = nodes_map_.find(inputs_it->second->name());
+      nonroots.insert(inputs_it->second);
+      auto child_it = nodes_map_.find(inputs_it->second);
       if (child_it != nodes_map_.end()) {
         node->children.push_back(child_it->second.get());
       }
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 19257968e6..e2be2cf4cf 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -51,19 +51,21 @@ bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
 void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
   devices_.insert(dev);
   if (step_stat.all_start_micros() > 0) {
-    if (all_start_micros_ > 0) {
-      all_start_micros_ = std::min(
-          all_start_micros_, static_cast<int64>(step_stat.all_start_micros()));
+    if (exec_.all_start_micros() > 0) {
+      exec_.set_all_start_micros(
+          std::min(static_cast<int64>(exec_.all_start_micros()),
+                   static_cast<int64>(step_stat.all_start_micros())));
     } else {
-      all_start_micros_ = step_stat.all_start_micros();
+      exec_.set_all_start_micros(step_stat.all_start_micros());
     }
     int64 op_end_rel_micros = step_stat.op_end_rel_micros();
     // Round quick execution to 1 micro to be semantically robust.
     if (op_end_rel_micros == 0) {
       ++op_end_rel_micros;
     }
-    latest_end_micros_ = std::max(
-        latest_end_micros_, step_stat.all_start_micros() + op_end_rel_micros);
+    exec_.set_latest_end_micros(
+        std::max(static_cast<int64>(exec_.latest_end_micros()),
+                 step_stat.all_start_micros() + op_end_rel_micros));
 
     const std::pair<int64, int64> pair =
         std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
@@ -75,17 +77,17 @@ void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
       op_execs_[dev].push_back(pair);
       // In while-loop, a graph node is executed multiple times under
       // the same name.
-      run_count_ += 1;
+      exec_.set_run_count(exec_.run_count() + 1);
     }
   }
 }
 
 void ExecStep::AddMemoryStats(const string& dev,
                               const NodeExecStats& step_stat) {
-  if (mem_initiated_) {
+  if (exec_.memory_intialized()) {
     return;
   }
-  mem_initiated_ = true;
+  exec_.set_memory_intialized(true);
 
   for (const auto& mem : step_stat.memory()) {
     // TODO(xpan): Fix this hack. Currently the allocator name seems quite
@@ -93,9 +95,9 @@ void ExecStep::AddMemoryStats(const string& dev,
     if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
       continue;
     }
-    allocator_bytes_in_use_ =
-        std::max(allocator_bytes_in_use_,
-                 static_cast<int64>(mem.allocator_bytes_in_use()));
+    exec_.set_allocator_bytes_in_use(
+        std::max(static_cast<int64>(exec_.allocator_bytes_in_use()),
+                 static_cast<int64>(mem.allocator_bytes_in_use())));
   }
   int64 total_output_bytes = 0;
   for (const auto& output : step_stat.output()) {
@@ -114,16 +116,20 @@ void ExecStep::AddMemoryStats(const string& dev,
       output_memory_[output.slot()] = std::make_pair(output_bytes, output_ptr);
     }
   }
-  output_bytes_ = total_output_bytes;
+  exec_.set_output_bytes(total_output_bytes);
 
   if (step_stat.has_memory_stats()) {
-    host_temp_bytes_ += step_stat.memory_stats().host_temp_memory_size();
-    host_persistent_bytes_ +=
-        step_stat.memory_stats().host_persistent_memory_size();
-    accelerator_temp_bytes_ +=
-        step_stat.memory_stats().device_temp_memory_size();
-    accelerator_persistent_bytes_ +=
-        step_stat.memory_stats().device_persistent_memory_size();
+    exec_.set_host_temp_bytes(exec_.host_temp_bytes() +
+                              step_stat.memory_stats().host_temp_memory_size());
+    exec_.set_host_persistent_bytes(
+        exec_.host_persistent_bytes() +
+        step_stat.memory_stats().host_persistent_memory_size());
+    exec_.set_accelerator_temp_bytes(
+        exec_.accelerator_temp_bytes() +
+        step_stat.memory_stats().device_temp_memory_size());
+    exec_.set_accelerator_persistent_bytes(
+        exec_.accelerator_persistent_bytes() +
+        step_stat.memory_stats().device_persistent_memory_size());
   }
   int64 residual_bytes = 0;
   int64 requested_bytes = 0;
@@ -133,9 +139,9 @@ void ExecStep::AddMemoryStats(const string& dev,
     requested_bytes += mem.total_bytes();
     peak_bytes += mem.peak_bytes();
   }
-  requested_bytes_ = requested_bytes;
-  residual_bytes_ = residual_bytes;
-  peak_bytes_ = peak_bytes;
+  exec_.set_requested_bytes(requested_bytes);
+  exec_.set_residual_bytes(residual_bytes);
+  exec_.set_peak_bytes(peak_bytes);
 }
 
 void TFGraphNode::AddStepStat(int64 step, const string& device,
@@ -147,33 +153,33 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
   // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
   // It can has some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
   if (IsCanonicalDevice(dev)) {
-    if (!canonical_device_.empty()) {
-      if (canonical_device_ != dev) {
+    if (!node_.canonical_device().empty()) {
+      if (node_.canonical_device() != dev) {
         // TODO(xpan): Some RunMetadata node appears at multiple devices.
         // Need to address it.
         return;
       }
     } else {
-      canonical_device_ = dev;
+      node_.set_canonical_device(dev);
       // TODO(xpan): Support things other than gpu?
       if (dev.find("sycl") != dev.npos) {
-        host_device_ = StringReplace(dev, "device:sycl:\\d+", "cpu:0");
+        node_.set_host_device(StringReplace(dev, "device:sycl:\\d+", "cpu:0"));
       } else {
-        host_device_ = StringReplace(dev, "gpu:\\d+", "cpu:0");
+        node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
       }
-      AddOpType(canonical_device_);
+      AddOpType(node_.canonical_device());
     }
   }
 
   auto exec = execs_.find(step);
   if (exec == execs_.end()) {
-    execs_.insert(std::pair<int64, ExecStep>(step, ExecStep(this)));
+    execs_.insert(std::pair<int64, ExecStep>(step, ExecStep()));
     exec = execs_.find(step);
   }
 
   exec->second.AddTimeStats(dev, step_stat);
 
-  if (dev == canonical_device_) {
+  if (dev == node_.canonical_device()) {
     exec->second.AddMemoryStats(dev, step_stat);
   }
 }
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
index 5ec3da12cf..55d53f3923 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -44,27 +44,13 @@ class TFGraphNode;
 
 class ExecStep {
  public:
-  ExecStep(TFGraphNode* node)
-      : node(node),
-        run_count_(0),
-        all_start_micros_(0),
-        latest_end_micros_(0),
-        mem_initiated_(false),
-        requested_bytes_(0),
-        peak_bytes_(0),
-        residual_bytes_(0),
-        output_bytes_(0),
-        host_temp_bytes_(0),
-        host_persistent_bytes_(0),
-        accelerator_temp_bytes_(0),
-        accelerator_persistent_bytes_(0),
-        allocator_bytes_in_use_(0) {}
+  ExecStep() {}
 
   void AddTimeStats(const string& dev, const NodeExecStats& step_stat);
 
   void AddMemoryStats(const string& dev, const NodeExecStats& step_stat);
 
-  int64 run_count() const { return run_count_; }
+  int64 run_count() const { return exec_.run_count(); }
   // The execution time of an op. If it runs on accelerator, then it's
   // accelerator_exec_micros(). Otherwise, it's CPU time.
   int64 exec_micros() const;
@@ -77,31 +63,101 @@ class ExecStep {
       const {
     return op_execs_;
   }
-  int64 all_start_micros() const { return all_start_micros_; }
-  int64 latest_end_micros() const { return latest_end_micros_; }
-
-  int64 requested_bytes() const { return requested_bytes_; }
-  int64 peak_bytes() const { return peak_bytes_; }
-  int64 residual_bytes() const { return residual_bytes_; }
-  int64 output_bytes() const { return output_bytes_; }
-  int64 accelerator_temp_bytes() const { return accelerator_temp_bytes_; }
-  int64 host_temp_bytes() const { return host_temp_bytes_; }
+  int64 all_start_micros() const { return exec_.all_start_micros(); }
+  int64 latest_end_micros() const { return exec_.latest_end_micros(); }
+
+  int64 requested_bytes() const { return exec_.requested_bytes(); }
+  int64 peak_bytes() const { return exec_.peak_bytes(); }
+  int64 residual_bytes() const { return exec_.residual_bytes(); }
+  int64 output_bytes() const { return exec_.output_bytes(); }
+  int64 accelerator_temp_bytes() const {
+    return exec_.accelerator_temp_bytes();
+  }
+  int64 host_temp_bytes() const { return exec_.host_temp_bytes(); }
   int64 accelerator_persistent_bytes() const {
-    return accelerator_persistent_bytes_;
+    return exec_.accelerator_persistent_bytes();
   }
-  int64 host_persistent_bytes() const { return host_persistent_bytes_; }
-  const std::map<int64, std::pair<int64, uint64>>& output_memory() const {
+  int64 host_persistent_bytes() const { return exec_.host_persistent_bytes(); }
+  const std::map<int32, std::pair<int64, uint64>>& output_memory() const {
     return output_memory_;
   }
-  int64 allocator_bytes_in_use() const { return allocator_bytes_in_use_; }
+  int64 allocator_bytes_in_use() const {
+    return exec_.allocator_bytes_in_use();
+  }
+
+  const ExecProfile& ToProto() {
+    exec_.mutable_accelerator_execs()->clear();
+    for (const auto& e : accelerator_execs_) {
+      auto& exec_time = (*exec_.mutable_accelerator_execs())[e.first];
+      for (const auto& p : e.second) {
+        auto* t = exec_time.mutable_times()->Add();
+        t->add_int64_values(p.first);
+        t->add_int64_values(p.second);
+      }
+    }
+
+    exec_.mutable_cpu_execs()->clear();
+    for (const auto& e : cpu_execs_) {
+      auto& exec_time = (*exec_.mutable_cpu_execs())[e.first];
+      for (const auto& p : e.second) {
+        auto* t = exec_time.mutable_times()->Add();
+        t->add_int64_values(p.first);
+        t->add_int64_values(p.second);
+      }
+    }
+
+    exec_.mutable_devices()->Clear();
+    exec_.mutable_devices()->Reserve(devices_.size());
+    for (const string& d : devices_) {
+      exec_.add_devices(d);
+    }
+
+    exec_.mutable_output_memory()->clear();
+    for (const auto& mem : output_memory_) {
+      auto& mem_pb = (*exec_.mutable_output_memory())[mem.first];
+      mem_pb.set_bytes(mem.second.first);
+      mem_pb.set_ptr(mem.second.second);
+    }
+
+    return exec_;
+  }
+
+  void FromProto(const ExecProfile& exec) {
+    exec_.Clear();
+    exec_.MergeFrom(exec);
+
+    devices_.clear();
+    devices_.insert(exec.devices().begin(), exec.devices().end());
+
+    accelerator_execs_.clear();
+    cpu_execs_.clear();
+    op_execs_.clear();
+
+    for (const auto& exec_time : exec_.accelerator_execs()) {
+      auto& exec = accelerator_execs_[exec_time.first];
+      auto& op_exec = op_execs_[exec_time.first];
+      for (const auto& p : exec_time.second.times()) {
+        exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
+        op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
+      }
+    }
+    for (const auto& exec_time : exec_.cpu_execs()) {
+      auto& exec = cpu_execs_[exec_time.first];
+      auto& op_exec = op_execs_[exec_time.first];
+      for (const auto& p : exec_time.second.times()) {
+        exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
+        op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
+      }
+    }
+    for (const auto& output_mem : exec_.output_memory()) {
+      auto& mem = output_memory_[output_mem.first];
+      mem.first = output_mem.second.bytes();
+      mem.second = output_mem.second.ptr();
+    }
+  }
 
  private:
-  TFGraphNode* node;
-  // Can be larger than 1 if run multiple times in loop.
-  int64 run_count_;
-  // The earliest/latest time including scheduling and execution.
-  int64 all_start_micros_;
-  int64 latest_end_micros_;
+  ExecProfile exec_;
   // device -> vector of {op_start_micros, op_exec_micros} pairs.
   // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
   // For accelerator, vector size can be larger than 1, multiple kernel fires
@@ -115,26 +171,8 @@ class ExecStep {
   // All devices the op is associated with (e.g. gpu:0 (scheduling),
   // gpu:0:stream:xx (kernel exec), cpu:0 host)
   std::set<string> devices_;
-
-  bool mem_initiated_;
-  // Total bytes requested by the op.
-  int64 requested_bytes_;
-  // Total bytes requested by the op and released before op end.
-  int64 peak_bytes_;
-  // Total bytes requested by the op and not released after op end.
-  int64 residual_bytes_;
-  // Total bytes output by the op (not necessarily requested by the op).
-  int64 output_bytes_;
-  // Total temporary bytes allocated and released by the op.
-  int64 host_temp_bytes_;
-  // Total persistent bytes (e.g. variable) allocated by the op.
-  int64 host_persistent_bytes_;
-  int64 accelerator_temp_bytes_;
-  int64 accelerator_persistent_bytes_;
-  // The total number of bytes currently allocated by the allocator if >0.
-  int64 allocator_bytes_in_use_;
   // output_idx -> {output_bytes, memory_ptr}
-  std::map<int64, std::pair<int64, uint64>> output_memory_;
+  std::map<int32, std::pair<int64, uint64>> output_memory_;
 };
 
 #define GRAPH_NODE_BYTES(type)                                \
@@ -157,10 +195,18 @@ class ExecStep {
 
 class TFGraphNode {
  public:
-  TFGraphNode(const NodeDef* node)
-      : node_(node), float_ops_(0), op_(node->op()) {
+  TFGraphNode(const ProfileNode& node, const ProfileProto& profile) {
+    FromProto(node, profile);
+  }
+
+  TFGraphNode(const NodeDef* node, int64 id) {
+    node_.set_id(id);
+    node_.set_name(node->name());
+    node_.set_op(node->op());
+    node_.set_float_ops(0);
+
     for (const auto& attr : node->attr()) {
-      op_attrs_[attr.first] = &attr.second;
+      (*node_.mutable_attrs())[attr.first].MergeFrom(attr.second);
       if (attr.first == "shape" && attr.second.has_shape()) {
         if (!shape_.empty()) {
           fprintf(stderr, "Found duplicated shapes!\n");
@@ -180,10 +226,10 @@ class TFGraphNode {
     op_types_.insert(node->op());
   }
 
-  void AddInput(TFGraphNode* input, int64 output_idx, int input_idx) {
+  void AddInput(TFGraphNode* input, int32 output_idx, int input_idx) {
     src_output_idx_[input->name()] = output_idx;
 
-    inputs_[input_idx] = input;
+    inputs_[input_idx] = input->name();
     const auto& output_shape = input->output_shapes().find(output_idx);
     // Always create an empty vec even if the shape info might be missing.
     std::vector<int64>& shape_vec = input_shapes_[input_idx];
@@ -198,28 +244,123 @@ class TFGraphNode {
   void AddStepStat(int64 step, const string& device,
                    const NodeExecStats& step_stat);
 
-  void AddFloatOps(int64 float_ops) { float_ops_ = float_ops; }
+  void AddFloatOps(int64 float_ops) { node_.set_float_ops(float_ops); }
 
   // TODO(xpan): This could take a lot of memory.
-  void AddCode(const CodeDef& code) { code_.MergeFrom(code); }
+  void AddCode(const CodeDef& code) { node_.mutable_trace()->MergeFrom(code); }
 
-  const string& name() const { return node_->name(); }
-  const string& op() const { return op_; }
-  const NodeDef* node_def() { return node_; }
+  const string& name() const { return node_.name(); }
+  int64 id() const { return node_.id(); }
+  const string& op() const { return node_.op(); }
+  const ProfileNode& node() { return node_; }
 
   bool trackable(int64 step) const {
     auto exec = execs_.find(step);
     if (exec == execs_.end()) return false;
 
     if (exec->second.all_start_micros() == 0) return false;
-    if (canonical_device_.empty() || host_device_.empty()) {
+    if (node_.canonical_device().empty() || node_.host_device().empty()) {
       return false;
     }
     return true;
   }
 
-  const std::map<int, TFGraphNode*>& inputs() const { return inputs_; }
-  const std::map<string, int64>& src_output_idx() const {
+  const ProfileNode& ToProto(
+      const std::map<string, std::unique_ptr<TFGraphNode>>& nodes_map) {
+    node_.clear_shape();
+    node_.mutable_shape()->Reserve(shape().size());
+    for (int64 s : shape()) {
+      node_.add_shape(s);
+    }
+
+    node_.clear_op_types();
+    node_.mutable_op_types()->Reserve(op_types().size());
+    for (const string& t : op_types()) {
+      node_.add_op_types(t);
+    }
+
+    node_.clear_execs();
+    for (auto& exec : execs_) {
+      auto& exec_pb = (*node_.mutable_execs())[exec.first];
+      exec_pb.MergeFrom(exec.second.ToProto());
+    }
+
+    node_.clear_inputs();
+    for (const auto& inp : inputs_) {
+      (*node_.mutable_inputs())[inp.first] = nodes_map.at(inp.second)->id();
+    }
+
+    node_.clear_input_shapes();
+    for (const auto& s : input_shapes_) {
+      auto& shape = (*node_.mutable_input_shapes())[s.first];
+      for (int64 d : s.second) {
+        shape.add_int64_values(d);
+      }
+    }
+
+    node_.clear_output_shapes();
+    for (const auto& s : output_shapes_) {
+      auto& shape = (*node_.mutable_output_shapes())[s.first];
+      for (int64 d : s.second) {
+        shape.add_int64_values(d);
+      }
+    }
+
+    node_.clear_src_output_index();
+    for (const auto& s : src_output_idx_) {
+      int64 id = nodes_map.at(s.first)->id();
+      (*node_.mutable_src_output_index())[id] = s.second;
+    }
+    return node_;
+  }
+
+  void FromProto(const ProfileNode& node, const ProfileProto& profile) {
+    node_.Clear();
+    node_.MergeFrom(node);
+
+    op_types_.clear();
+    op_types_.insert(node_.op_types().begin(), node_.op_types().end());
+
+    shape_.clear();
+    for (int64 s : node_.shape()) {
+      shape_.push_back(s);
+    }
+
+    execs_.clear();
+    for (const auto& exec_pb : node.execs()) {
+      auto& exec = execs_[exec_pb.first];
+      exec.FromProto(exec_pb.second);
+    }
+
+    inputs_.clear();
+    for (const auto& inp : node.inputs()) {
+      inputs_[inp.first] = profile.nodes().at(inp.second).name();
+    }
+
+    input_shapes_.clear();
+    for (const auto& s : node.input_shapes()) {
+      auto& shape = input_shapes_[s.first];
+      for (const int64 d : s.second.int64_values()) {
+        shape.push_back(d);
+      }
+    }
+
+    output_shapes_.clear();
+    for (const auto& s : node.output_shapes()) {
+      auto& shape = output_shapes_[s.first];
+      for (const int64 d : s.second.int64_values()) {
+        shape.push_back(d);
+      }
+    }
+
+    src_output_idx_.clear();
+    for (const auto& s : node.src_output_index()) {
+      src_output_idx_[profile.nodes().at(s.first).name()] = s.second;
+    }
+  }
+
+  const std::map<int32, string>& inputs() const { return inputs_; }
+  const std::map<string, int32>& src_output_idx() const {
     return src_output_idx_;
   }
 
@@ -346,7 +487,7 @@ class TFGraphNode {
     CHECK(exec != execs_.end()) << "unknown step " << step;
     return exec->second.host_persistent_bytes();
   }
-  const std::map<int64, std::pair<int64, uint64>>& output_memory(
+  const std::map<int32, std::pair<int64, uint64>>& output_memory(
       int64 step) const {
     auto exec = execs_.find(step);
     CHECK(exec != execs_.end()) << "unknown step " << step;
@@ -382,18 +523,22 @@ class TFGraphNode {
   int64 float_ops(int64 step) const {
     // If not run, return static analysis.
     if (execs_.empty()) {
-      return float_ops_;
+      return node_.float_ops();
     }
     // Otherwise, return dynamic float_ops.
-    return float_ops_ * run_count(step);
+    return node_.float_ops() * run_count(step);
   }
-  const CodeDef& code() { return code_; }
-  string canonical_device() const { return canonical_device_; }
-  string host_device() const { return host_device_; }
+  const CodeDef& code() { return node_.trace(); }
+  string canonical_device() const { return node_.canonical_device(); }
+  string host_device() const { return node_.host_device(); }
   const std::set<string>& op_types() const { return op_types_; }
 
-  const std::map<string, const AttrValue*>& op_attrs() const {
-    return op_attrs_;
+  const AttrValue* op_attrs(const string& name) const {
+    const auto it = node_.attrs().find(name);
+    if (it == node_.attrs().end()) {
+      return nullptr;
+    }
+    return &it->second;
   }
 
   const std::vector<int64>& shape() const { return shape_; }
@@ -406,12 +551,10 @@ class TFGraphNode {
   }
 
  private:
-  std::map<int, TFGraphNode*> inputs_;
-  std::map<string, int64> src_output_idx_;
+  std::map<int, string> inputs_;
+  std::map<string, int32> src_output_idx_;
 
-  const NodeDef* node_;
-
-  CodeDef code_;
+  ProfileNode node_;
 
   std::vector<int64> shape_;
   // Won't missing input_idx. But some shapes might be empty (unknown).
@@ -421,18 +564,8 @@ class TFGraphNode {
   std::map<int, std::vector<int64>> output_shapes_;
 
   std::set<string> op_types_;
-  std::map<string, const AttrValue*> op_attrs_;
 
   std::map<int64, ExecStep> execs_;
-
-  // /j:#/t:#/r:#/device:#. A canonical device name without extra suffix.
-  string canonical_device_;
-  // The host device name.
-  string host_device_;
-
-  int64 float_ops_;
-
-  string op_;
 };
 
 class TFMultiGraphNode {
diff --git a/tensorflow/core/profiler/internal/tfprof_scope.cc b/tensorflow/core/profiler/internal/tfprof_scope.cc
index d7293c5e52..128b296d5c 100644
--- a/tensorflow/core/profiler/internal/tfprof_scope.cc
+++ b/tensorflow/core/profiler/internal/tfprof_scope.cc
@@ -34,8 +34,8 @@ ScopeNode* TFScope::CreateParentNode(const string& name) {
   node_defs_.push_back(std::unique_ptr<NodeDef>(new NodeDef()));
   node_defs_.back()->set_name(name);
   node_defs_.back()->set_op(kTFScopeParent);
-  parent_nodes_[name] =
-      std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get()));
+  parent_nodes_[name] = std::unique_ptr<TFGraphNode>(
+      new TFGraphNode(node_defs_.back().get(), -1));
   nodes_map_[name] =
       std::unique_ptr<ScopeNode>(new ScopeNode(parent_nodes_[name].get()));
   return nodes_map_[name].get();
diff --git a/tensorflow/core/profiler/internal/tfprof_show_test.cc b/tensorflow/core/profiler/internal/tfprof_show_test.cc
index 24cf9afb17..1f19f8c322 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show_test.cc
@@ -68,6 +68,23 @@ class TFProfShowTest : public ::testing::Test {
     tf_stats_->BuildAllViews();
   }
 
+  string TestToFromProto(const string& cmd, const Options& opts,
+                         bool show_multi_node = false) {
+    string profile_file = io::JoinPath(testing::TmpDir(), "profile");
+    tf_stats_->WriteProfile(profile_file);
+    TFStats new_stats(profile_file, nullptr);
+    new_stats.BuildAllViews();
+    if (show_multi_node) {
+      new_stats.ShowMultiGraphNode(cmd, opts);
+    } else {
+      new_stats.ShowGraphNode(cmd, opts);
+    }
+    string dump_str;
+    TF_CHECK_OK(ReadFileToString(Env::Default(),
+                                 opts.output_options.at("outfile"), &dump_str));
+    return dump_str;
+  }
+
   std::unique_ptr<TFStats> tf_stats_;
 };
 
@@ -95,6 +112,8 @@ TEST_F(TFProfShowTest, DumpScopeMode) {
       "0us/0us, 11us/11us)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, "
       "0B/0B, 0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
       dump_str);
+
+  EXPECT_EQ(dump_str, TestToFromProto("scope", opts));
 }
 
 TEST_F(TFProfShowTest, DumpAcceleratorAndCPUMicros) {
@@ -140,6 +159,8 @@ TEST_F(TFProfShowTest, DumpAcceleratorAndCPUMicros) {
       "0us/0us)\n    ScalarW/read (0us/0us, 0us/0us)\n  init (0us/0us, "
       "0us/0us)\n",
       dump_str);
+
+  EXPECT_EQ(dump_str, TestToFromProto("scope", opts));
 }
 
 TEST_F(TFProfShowTest, DumpOpMode) {
@@ -182,6 +203,8 @@ TEST_F(TFProfShowTest, DumpOpMode) {
       "0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6\t(run*0|defined*1)"
       "\texec_time:0us\n\n",
       StringReplace(dump_str, " ", ""));
+
+  EXPECT_EQ(dump_str, TestToFromProto("op", opts, true));
 }
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc
index 81db766796..5b54958344 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats.cc
@@ -46,12 +46,11 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
                  std::unique_ptr<OpLogProto> op_log,
                  std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader)
     : has_code_traces_(false),
-      graph_(std::move(graph)),
       ckpt_reader_(std::move(ckpt_reader)) {
-  CHECK(graph_) << "Must at least have GraphDef";
+  CHECK(graph) << "Must at least have GraphDef";
 
   printf("Parsing Inputs...\n");
-  ParseGraph();
+  AddGraph(std::move(graph));
   if (run_meta && run_meta->has_step_stats()) {
     AddRunMeta(0, std::move(run_meta));
   }
@@ -67,6 +66,30 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
   }
 }
 
+TFStats::TFStats(const string& filename,
+                 std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader)
+    : has_code_traces_(false), ckpt_reader_(std::move(ckpt_reader)) {
+  string str;
+  Status s = ReadFileToString(Env::Default(), filename, &str);
+  if (!s.ok()) {
+    fprintf(stderr, "Failed to read profile: %s", s.ToString().c_str());
+    return;
+  }
+
+  ProfileProto profile;
+  if (!profile.ParseFromString(str)) {
+    fprintf(stderr, "Failed to parse profile\n");
+    return;
+  }
+
+  for (const auto& node_pb : profile.nodes()) {
+    std::unique_ptr<TFGraphNode> node(new TFGraphNode(node_pb.second, profile));
+    nodes_map_.insert(std::pair<string, std::unique_ptr<TFGraphNode>>(
+        node_pb.second.name(), std::move(node)));
+  }
+  has_code_traces_ = profile.has_trace();
+}
+
 void TFStats::BuildView(const string& cmd) {
   if (cmd == kCmds[0] && !scope_view_) {
     scope_view_.reset(new TFScope(ckpt_reader_.get()));
@@ -135,16 +158,20 @@ const MultiGraphNodeProto& TFStats::ShowMultiGraphNode(
   }
 }
 
-void TFStats::ParseGraph() {
-  for (const NodeDef& node : graph_->node()) {
-    CHECK(nodes_map_.find(node.name()) == nodes_map_.end());
+void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) {
+  std::map<string, const NodeDef*> node_defs;
+  for (const NodeDef& node : graph->node()) {
+    if (nodes_map_.find(node.name()) != nodes_map_.end()) {
+      continue;
+    }
     nodes_map_[node.name()] =
-        std::unique_ptr<TFGraphNode>(new TFGraphNode(&node));
+        std::unique_ptr<TFGraphNode>(new TFGraphNode(&node, nodes_map_.size()));
+    node_defs[node.name()] = &node;
   }
-  for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
-    const NodeDef* node_def = it->second->node_def();
-    for (int i = 0; i < node_def->input_size(); ++i) {
-      string node_input = node_def->input(i);
+  for (auto it = node_defs.begin(); it != node_defs.end(); it++) {
+    TFGraphNode* node = nodes_map_.at(it->first).get();
+    for (int i = 0; i < it->second->input_size(); ++i) {
+      string node_input = it->second->input(i);
       int output_idx = 0;
       // input name format can be: "^node:src_output"
       auto prefix_pos = node_input.find(":");
@@ -160,10 +187,12 @@ void TFStats::ParseGraph() {
         node_input = node_input.substr(1);
       }
       auto input_node = nodes_map_.find(node_input);
+      // TODO(xpan): P1: Add the input even if it doesn't exist yet, because
+      // this can be a partial graph.
       if (input_node == nodes_map_.end()) {
         continue;
       }
-      it->second->AddInput(input_node->second.get(), output_idx, i);
+      node->AddInput(input_node->second.get(), output_idx, i);
     }
   }
 }
@@ -210,10 +239,8 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
       if (node == nodes_map_.end()) {
         NodeDef def;
         if (CreateRunMetadataNode(name, &def)) {
-          NodeDef* ndef = graph_->mutable_node()->Add();
-          ndef->MergeFrom(def);
-          nodes_map_[name] =
-              std::unique_ptr<TFGraphNode>(new TFGraphNode(ndef));
+          nodes_map_[name] = std::unique_ptr<TFGraphNode>(
+              new TFGraphNode(&def, nodes_map_.size()));
           nodes_map_.at(name)->AddStepStat(step, dev_stat.device(), node_stat);
         }
       } else {
@@ -223,6 +250,25 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
   }
 }
 
+void TFStats::WriteProfile(const string& filename) {
+  ProfileProto profile;
+  for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+    if (it->second->id() < 0) {
+      continue;
+    }
+    (*profile.mutable_nodes())[it->second->id()].MergeFrom(
+        it->second->ToProto(nodes_map_));
+    if (it->second->code().traces_size() > 0) {
+      profile.set_has_trace(true);
+    }
+  }
+  Status s =
+      WriteStringToFile(Env::Default(), filename, profile.SerializeAsString());
+  if (!s.ok()) {
+    fprintf(stderr, "%s\n", s.ToString().c_str());
+  }
+}
+
 bool TFStats::Validate(const Options& opts) const {
   if (opts.step >= 0 && steps_.find(opts.step) == steps_.end()) {
     fprintf(stderr, "Options -step=%lld not found\n", opts.step);
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h
index 8dbab16fe4..3b1251152d 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.h
+++ b/tensorflow/core/profiler/internal/tfprof_stats.h
@@ -55,6 +55,10 @@ class TFStats {
           std::unique_ptr<RunMetadata> run_meta,
           std::unique_ptr<OpLogProto> op_log,
           std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader);
+
+  TFStats(const string& filename,
+          std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader);
+
   ~TFStats() {}
 
   const std::map<string, std::unique_ptr<TFGraphNode>>& nodes() const {
@@ -76,23 +80,25 @@ class TFStats {
   const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd,
                                                 const Options& opts) const;
 
+  // A a (partial) graph to existing graph.
+  void AddGraph(std::unique_ptr<GraphDef> graph);
+
   // Add a step of run time meta data.
   void AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta);
   // Add tfprof operation meta data, such as customized op type, float_ops,
   // and code traces.
   void AddOpLogProto(std::unique_ptr<OpLogProto> op_log);
 
+  void WriteProfile(const string& filename);
+
   // For test purpose only.
   void AddNodeForTest(int64 step, std::unique_ptr<TFGraphNode> node);
 
  private:
   bool Validate(const Options& opts) const;
 
-  void ParseGraph();
-
   std::set<int64> steps_;
   bool has_code_traces_;
-  std::unique_ptr<GraphDef> graph_;
   std::unique_ptr<TFScope> scope_view_;
   std::unique_ptr<TFGraph> graph_view_;
   std::unique_ptr<TFCode> code_view_;
diff --git a/tensorflow/core/profiler/internal/tfprof_stats_test.cc b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
index 0bea14fe66..2f2101d76b 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
@@ -69,6 +69,14 @@ class TFProfStatsTest : public ::testing::Test {
     tf_stats_->BuildAllViews();
   }
 
+  string TestToFromProto(const string& cmd, const Options& opts) {
+    string profile_file = io::JoinPath(testing::TmpDir(), "profile");
+    tf_stats_->WriteProfile(profile_file);
+    TFStats new_stats(profile_file, nullptr);
+    new_stats.BuildAllViews();
+    return new_stats.ShowGraphNode(cmd, opts).DebugString();
+  }
+
   std::unique_ptr<TFStats> tf_stats_;
 };
 
@@ -98,6 +106,8 @@ TEST_F(TFProfStatsTest, CustomOpType) {
       "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+  EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts));
 }
 
 TEST_F(TFProfStatsTest, CheckPointOpType) {
@@ -126,6 +136,8 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
       "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+  EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts));
 }
 
 TEST_F(TFProfStatsTest, TestGraph) {
@@ -157,6 +169,8 @@ TEST_F(TFProfStatsTest, TestGraph) {
       "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+  EXPECT_EQ(root.DebugString(), TestToFromProto("graph", opts));
 }
 
 TEST_F(TFProfStatsTest, TestFloatOps) {
@@ -203,6 +217,8 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
       "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+  EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts));
 }
 
 TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
@@ -236,6 +252,8 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
       "4096\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+  EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts));
 }
 
 TEST_F(TFProfStatsTest, TestShowTensorValue) {
diff --git a/tensorflow/core/profiler/internal/tfprof_utils.cc b/tensorflow/core/profiler/internal/tfprof_utils.cc
index 383c4725b7..1ce59ad755 100644
--- a/tensorflow/core/profiler/internal/tfprof_utils.cc
+++ b/tensorflow/core/profiler/internal/tfprof_utils.cc
@@ -297,6 +297,7 @@ void PrintHelp() {
       "See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/"
       "g3doc/command_line.md for command line tool tutorial.\n");
   printf(
+      "profiler --profile_path=<ProfileProto binary file> # required\nor:\n"
       "profiler --graph_path=<GraphDef proto file>  # required\n"
       "         --run_meta_patn=<RunMetadata proto file>  # optional\n"
       "         --run_log_path=<OpLogProto proto file>  # optional\n\n");
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 6acf4ea377..96e0b06bf3 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -66,6 +66,7 @@ void completion(const char* buf, linenoiseCompletions* lc) {
 }
 
 int Run(int argc, char** argv) {
+  string FLAGS_profile_path = "";
   string FLAGS_graph_path = "";
   string FLAGS_run_meta_path = "";
   string FLAGS_op_log_path = "";
@@ -96,6 +97,7 @@ int Run(int argc, char** argv) {
   }
 
   std::vector<Flag> flag_list = {
+      Flag("profile_path", &FLAGS_profile_path, "Profile binary file name."),
       Flag("graph_path", &FLAGS_graph_path, "GraphDef proto text file name"),
       Flag("run_meta_path", &FLAGS_run_meta_path,
            "Comma-separated list of RunMetadata proto binary "
@@ -138,6 +140,12 @@ int Run(int argc, char** argv) {
   }
   port::InitMain(argv[0], &argc, &argv);
 
+  if (!FLAGS_profile_path.empty() && !FLAGS_graph_path.empty()) {
+    fprintf(stderr,
+            "both --graph_path and --profile_path are set. "
+            "Ignore graph_path\n");
+  }
+
   std::vector<string> account_type_regexes =
       str_util::Split(FLAGS_account_type_regexes, ',', str_util::SkipEmpty());
   std::vector<string> start_name_regexes =
@@ -157,7 +165,7 @@ int Run(int argc, char** argv) {
   CHECK(s.ok()) << s.ToString();
 
   string cmd = "";
-  if (argc == 1 && FLAGS_graph_path.empty()) {
+  if (argc == 1 && FLAGS_graph_path.empty() && FLAGS_profile_path.empty()) {
     PrintHelp();
     return 0;
   } else if (argc > 1) {
@@ -173,24 +181,6 @@ int Run(int argc, char** argv) {
   }
 
   printf("Reading Files...\n");
-  std::unique_ptr<GraphDef> graph(new GraphDef());
-  TF_CHECK_OK(
-      ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
-
-  std::unique_ptr<OpLogProto> op_log(new OpLogProto());
-  if (!FLAGS_op_log_path.empty()) {
-    string op_log_str;
-    s = ReadFileToString(Env::Default(), FLAGS_op_log_path, &op_log_str);
-    if (!s.ok()) {
-      fprintf(stderr, "Failed to read op_log_path: %s\n", s.ToString().c_str());
-      return 1;
-    }
-    if (!ParseProtoUnlimited(op_log.get(), op_log_str)) {
-      fprintf(stderr, "Failed to parse op_log_path\n");
-      return 1;
-    }
-  }
-
   std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader;
   TF_Status* status = TF_NewStatus();
   if (!FLAGS_checkpoint_path.empty()) {
@@ -204,25 +194,52 @@ int Run(int argc, char** argv) {
     TF_DeleteStatus(status);
   }
 
-  TFStats tf_stat(std::move(graph), nullptr, std::move(op_log),
-                  std::move(ckpt_reader));
+  std::unique_ptr<TFStats> tf_stat;
+  if (!FLAGS_profile_path.empty()) {
+    tf_stat.reset(new TFStats(FLAGS_profile_path, std::move(ckpt_reader)));
+  } else {
+    printf(
+        "Try to use a single --profile_path instead of "
+        "graph_path,op_log_path,run_meta_path\n");
+    std::unique_ptr<GraphDef> graph(new GraphDef());
+    TF_CHECK_OK(
+        ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
 
-  std::vector<string> run_meta_files =
-      str_util::Split(FLAGS_run_meta_path, ',', str_util::SkipEmpty());
-  for (int i = 0; i < run_meta_files.size(); ++i) {
-    std::unique_ptr<RunMetadata> run_meta(new RunMetadata());
-    s = ReadProtoFile(Env::Default(), run_meta_files[i], run_meta.get(), true);
-    if (!s.ok()) {
-      fprintf(stderr, "Failed to read run_meta_path %s. Status: %s\n",
-              run_meta_files[i].c_str(), s.ToString().c_str());
-      return 1;
+    std::unique_ptr<OpLogProto> op_log(new OpLogProto());
+    if (!FLAGS_op_log_path.empty()) {
+      string op_log_str;
+      s = ReadFileToString(Env::Default(), FLAGS_op_log_path, &op_log_str);
+      if (!s.ok()) {
+        fprintf(stderr, "Failed to read op_log_path: %s\n",
+                s.ToString().c_str());
+        return 1;
+      }
+      if (!ParseProtoUnlimited(op_log.get(), op_log_str)) {
+        fprintf(stderr, "Failed to parse op_log_path\n");
+        return 1;
+      }
+    }
+    tf_stat.reset(new TFStats(std::move(graph), nullptr, std::move(op_log),
+                              std::move(ckpt_reader)));
+
+    std::vector<string> run_meta_files =
+        str_util::Split(FLAGS_run_meta_path, ',', str_util::SkipEmpty());
+    for (int i = 0; i < run_meta_files.size(); ++i) {
+      std::unique_ptr<RunMetadata> run_meta(new RunMetadata());
+      s = ReadProtoFile(Env::Default(), run_meta_files[i], run_meta.get(),
+                        true);
+      if (!s.ok()) {
+        fprintf(stderr, "Failed to read run_meta_path %s. Status: %s\n",
+                run_meta_files[i].c_str(), s.ToString().c_str());
+        return 1;
+      }
+      tf_stat->AddRunMeta(i, std::move(run_meta));
     }
-    tf_stat.AddRunMeta(i, std::move(run_meta));
   }
 
   if (cmd == kCmds[4]) {
-    tf_stat.BuildAllViews();
-    Advisor(&tf_stat).Advise(Advisor::DefaultOptions());
+    tf_stat->BuildAllViews();
+    Advisor(tf_stat.get()).Advise(Advisor::DefaultOptions());
     return 0;
   }
 
@@ -236,12 +253,12 @@ int Run(int argc, char** argv) {
       select, output_type, output_options);
 
   if (cmd == kCmds[2] || cmd == kCmds[3]) {
-    tf_stat.BuildView(cmd);
-    tf_stat.ShowMultiGraphNode(cmd, opts);
+    tf_stat->BuildView(cmd);
+    tf_stat->ShowMultiGraphNode(cmd, opts);
     return 0;
   } else if (cmd == kCmds[0] || cmd == kCmds[1]) {
-    tf_stat.BuildView(cmd);
-    tf_stat.ShowGraphNode(cmd, opts);
+    tf_stat->BuildView(cmd);
+    tf_stat->ShowGraphNode(cmd, opts);
     return 0;
   }
 
@@ -270,14 +287,14 @@ int Run(int argc, char** argv) {
     } else if (cmd == kCmds[6]) {
       PrintHelp();
     } else if (cmd == kCmds[2] || cmd == kCmds[3]) {
-      tf_stat.BuildView(cmd);
-      tf_stat.ShowMultiGraphNode(cmd, new_opts);
+      tf_stat->BuildView(cmd);
+      tf_stat->ShowMultiGraphNode(cmd, new_opts);
     } else if (cmd == kCmds[0] || cmd == kCmds[1]) {
-      tf_stat.BuildView(cmd);
-      tf_stat.ShowGraphNode(cmd, new_opts);
+      tf_stat->BuildView(cmd);
+      tf_stat->ShowGraphNode(cmd, new_opts);
     } else if (cmd == kCmds[4]) {
-      tf_stat.BuildAllViews();
-      Advisor(&tf_stat).Advise(Advisor::DefaultOptions());
+      tf_stat->BuildAllViews();
+      Advisor(tf_stat.get()).Advise(Advisor::DefaultOptions());
     }
   }
   return 0;
diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto
index 006ee2b7a5..1ce5a5eecf 100644
--- a/tensorflow/core/profiler/tfprof_log.proto
+++ b/tensorflow/core/profiler/tfprof_log.proto
@@ -2,6 +2,8 @@ syntax = "proto3";
 
 package tensorflow.tfprof;
 
+import "tensorflow/core/framework/attr_value.proto";
+
 // It specifies the Python callstack that creates an op.
 message CodeDef {
   repeated Trace traces = 1;
@@ -31,3 +33,94 @@ message OpLogEntry {
 message OpLogProto {
   repeated OpLogEntry log_entries = 1;
 }
+
+// A proto representation of the profiler's profile.
+// It allows serialization, shipping around and deserialization of the profiles.
+//
+// Please don't depend on the internals of the profile proto.
+message ProfileProto {
+  map<int64, ProfileNode> nodes = 1;
+  // Whether or not has code traces.
+  bool has_trace = 2;
+}
+
+message ProfileNode {
+  // graph node name.
+  string name = 1;
+  // graph operation type.
+  string op = 9;
+  // A unique id for the node.
+  int64 id = 13;
+
+  map<int32, int64> inputs = 2;
+  map<int32, Tuple> input_shapes = 16;
+  map<int32, int64> outputs = 3;
+  map<int32, Tuple> output_shapes = 15;
+  // A map from source node id to its output index to current node.
+  map<int64, int32> src_output_index = 14;
+
+  repeated int64 shape = 4;
+  repeated string op_types = 5;
+  string canonical_device = 6;
+  string host_device = 7;
+
+  int64 float_ops = 8;
+
+  CodeDef trace = 10;
+  map<string, AttrValue> attrs = 11;
+
+  map<int64, ExecProfile> execs = 12;
+}
+
+message ExecProfile {
+  // Can be larger than 1 if run multiple times in loop.
+  int64 run_count = 1;
+  // The earliest/latest time including scheduling and execution.
+  int64 all_start_micros = 2;
+  int64 latest_end_micros = 3;
+
+  // device -> vector of {op_start_micros, op_exec_micros} pairs.
+  // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
+  // For accelerator, vector size can be larger than 1, multiple kernel fires
+  // or in tf.while_loop.
+  map<string, ExecTime> accelerator_execs = 4;
+  // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros}
+  // For cpu, vector size can be larger than 1 if in tf.while_loop.
+  map<string, ExecTime> cpu_execs = 5;
+
+  map<int32, Memory> output_memory = 17;
+
+  repeated string devices = 6;
+
+  // Total bytes requested by the op.
+  int64 requested_bytes = 7;
+  // Total bytes requested by the op and released before op end.
+  int64 peak_bytes = 8;
+  // Total bytes requested by the op and not released after op end.
+  int64 residual_bytes = 9;
+  // Total bytes output by the op (not necessarily requested by the op).
+  int64 output_bytes = 10;
+  // Total temporary bytes allocated and released by the op.
+  int64 host_temp_bytes = 11;
+  // Total persistent bytes (e.g. variable) allocated by the op.
+  int64 host_persistent_bytes = 12;
+  int64 accelerator_temp_bytes = 13;
+  int64 accelerator_persistent_bytes = 14;
+  // The total number of bytes currently allocated by the allocator if >0.
+  int64 allocator_bytes_in_use = 15;
+
+  bool memory_intialized = 16;
+}
+
+message ExecTime {
+  repeated Tuple times = 1;
+}
+
+message Tuple {
+  repeated int64 int64_values = 1;
+}
+
+message Memory {
+  int64 bytes = 1;
+  uint64 ptr = 2;
+}
author	A. Unique TensorFlower <gardener@tensorflow.org>	2017-09-06 21:56:33 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-09-06 22:01:54 -0700
commit	9e9ffa33d154b6c332bd475d6da2030746460fed (patch)
tree	64de1c00853926d394f6f4ba82f01f448e990fda /tensorflow/core/profiler
parent	27ce2a4ac956941ba8a0b9aaaa77acc0aa861fef (diff)