1. Adjust code view pprof image to better visualize backprop.

2. Allow to add multiple RunMetadata for 1 step, e.g. 1 for var initialization, 1 for training. So it has a complete profile. 3. Improve tests a bit. PiperOrigin-RevId: 165385567
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-08-15 17:48:55 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-08-15 17:52:23 -0700
commit: 93b21f7b1fa725299f86058436f034b15350de52 (patch)
tree: 6b0dcf69ed06601680b0b14eb0a9c25edfc292e9 /tensorflow
parent: 5db8be66563227f5bba37aeddce3951239dcd947 (diff)
10 files changed, 324 insertions, 79 deletions
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 06118e6eb2..f755089fc7 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -202,14 +202,13 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
 # nodes created by the Python call stack.
 # Nevertheless, it pops critical Python code path for us.
 #
-# `-trim_name_regexes` trims the python call stack, which are always the same
-# for the leaves.
+# `-trim_name_regexes` trims the some traces that have no valuable information.
 # `-select accelerator_micros` pick accelerator time for pprof graph. User
 # can also generate memory profile using `-select bytes`
-tfprof> code -max_depth 100 -trim_name_regexes '^ops.py.*' -select accelerator_micros -output pprof:outfile=<filename>
+tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile=<filename>  -trim_name_regexes .*apply_op.*
 
 # Use pprof to visualize the generated file.
-pprof -png --nodecount=20 --sample_index=1 <filename>
+pprof -png --nodecount=100 --sample_index=1 <filename>
 ```
 
 <left>
diff --git a/tensorflow/core/profiler/g3doc/pprof.jpg b/tensorflow/core/profiler/g3doc/pprof.jpg
index bf28967d68..8ffeda8360 100644
--- a/tensorflow/core/profiler/g3doc/pprof.jpg
+++ b/tensorflow/core/profiler/g3doc/pprof.jpg
diff --git a/tensorflow/core/profiler/internal/tfprof_code.cc b/tensorflow/core/profiler/internal/tfprof_code.cc
index 1c512a7ca1..7f4d682cda 100644
--- a/tensorflow/core/profiler/internal/tfprof_code.cc
+++ b/tensorflow/core/profiler/internal/tfprof_code.cc
@@ -32,6 +32,9 @@ limitations under the License.
 namespace tensorflow {
 namespace tfprof {
 namespace {
+
+const char* const kGradientSuffix = " (gradient)";
+
 // Convert to Trace proto into a short readable string.
 string GetTraceString(const CodeDef::Trace& trace) {
   string ntrace = io::Basename(trace.file()).ToString();
@@ -49,6 +52,24 @@ string GetTraceString(const CodeDef::Trace& trace) {
   return ntrace;
 }
 
+bool IsGradNode(const string& name, string* forward_name) {
+  // Given a forward operation with name op, its gradient op has the following
+  // name: ...gradients/op_grad/...
+  // TODO(xpan): This is hacky.
+  auto grad_prefix = name.find("gradients/");
+  auto grad_suffix = name.find("_grad/");
+  if (grad_prefix == name.npos || grad_suffix == name.npos) {
+    return false;
+  }
+  auto start = grad_prefix + string("gradients/").length();
+  auto len = grad_suffix - start;
+  if (len <= 0) {
+    return false;
+  }
+  *forward_name = name.substr(start, len);
+  return true;
+}
+
 // StringTable maps each string to an id.
 class StringTable {
  public:
@@ -170,12 +191,17 @@ class Samples {
     for (const CodeNode* cn : all_leaf) {
       for (auto gn_it : cn->node->graph_nodes()) {
         const TFGraphNode* gn = gn_it.second;
-        pprof::Sample* sample_pb = &sample_table_[gn->name()];
+        string name = gn->name();
+        // Generate a new trace name, in case the name is taken.
+        while (sample_table_.find(name) != sample_table_.end()) {
+          name += '@';
+        }
+        pprof::Sample* sample_pb = &sample_table_[name];
         for (uint64 id : location_ids) {
           sample_pb->mutable_location_id()->Add(id);
         }
         pprof::Label* label_pb = sample_pb->mutable_label()->Add();
-        label_pb->set_key(string_table_->GetIndex("node_name"));
+        label_pb->set_key(string_table_->GetIndex("graph node:"));
         label_pb->set_str(string_table_->GetIndex(gn->name()));
 
         sample_pb->mutable_value()->Add(1);
@@ -240,11 +266,11 @@ class PprofProfileImpl : public PprofProfile {
         samples_(new Samples(&string_table_, opts)) {}
 
   uint64 AddLocation(const CodeNode* callee, const CodeNode* caller) override {
-    const string& file_path = caller->trace->file();
-    uint64 lineno = caller->trace->lineno();
-    const string& callee_file_path = callee->trace->file();
-    const string& callee_function = callee->trace->function();
-    uint64 callee_func_start_line = callee->trace->func_start_line();
+    const string& file_path = caller->file();
+    uint64 lineno = caller->lineno();
+    const string& callee_file_path = callee->file();
+    const string& callee_function = callee->function();
+    uint64 callee_func_start_line = callee->func_start_line();
 
     return loc_table_->GetIndex(file_path, lineno, callee_function,
                                 callee_file_path, callee_func_start_line);
@@ -274,7 +300,7 @@ class PprofProfileImpl : public PprofProfile {
     if (!s.ok()) return s;
     s = zlib_output_buffer->Close();
     if (!s.ok()) return s;
-    fprintf(stdout, "\nRun pprof -png --nodecount=20 --sample_index=1 <%s>\n",
+    fprintf(stdout, "\nRun pprof -png --nodecount=100 --sample_index=1 <%s>\n",
             filename.c_str());
     return s;
   }
@@ -303,19 +329,20 @@ class PprofProfileImpl : public PprofProfile {
             string_table_.GetIndex("CPU execution time."));
       }
     } else if (type == kShown[0]) {
-      sample_type->set_unit(string_table_.GetIndex("requested bytes"));
+      sample_type->set_unit(string_table_.GetIndex("bytes"));
       profile_pb->mutable_comment()->Add(
-          string_table_.GetIndex("Sum of operation total requested memory."));
+          string_table_.GetIndex("Sum of operation total memory requests, "
+                                 "excluding deallocations."));
     } else if (type == kShown[11]) {
-      sample_type->set_unit(string_table_.GetIndex("peak bytes"));
+      sample_type->set_unit(string_table_.GetIndex("bytes"));
       profile_pb->mutable_comment()->Add(
           string_table_.GetIndex("Sum of operation peak memory usage."));
     } else if (type == kShown[12]) {
-      sample_type->set_unit(string_table_.GetIndex("residual bytes"));
+      sample_type->set_unit(string_table_.GetIndex("bytes"));
       profile_pb->mutable_comment()->Add(string_table_.GetIndex(
           "Sum of operation allocated memory after finish."));
     } else if (type == kShown[13]) {
-      sample_type->set_unit(string_table_.GetIndex("output bytes"));
+      sample_type->set_unit(string_table_.GetIndex("bytes"));
       profile_pb->mutable_comment()->Add(
           string_table_.GetIndex("Sum of operation output size."));
     } else if (type == kShown[2]) {
@@ -357,26 +384,101 @@ void TFCode::AddNode(TFGraphNode* node) {
   if (node->code().traces_size() == 0) {
     return;
   }
+
+  // We infer the forward operation name from gradient op name. So, we can
+  // map gradient op traces to forward op traces.
+  // E.g. gradient node of 'inp_1/Conv2D' would be 'gradients/inp_1/Conv2D_grad.
+  string forward_name;
+  if (IsGradNode(node->name(), &forward_name)) {
+    auto grad_nodes_it = grad_nodes_.find(forward_name);
+    if (grad_nodes_it != grad_nodes_.end()) {
+      grad_nodes_it->second.push_back(node);
+    } else {
+      grad_nodes_.insert(
+          std::pair<string, std::vector<TFGraphNode*>>(forward_name, {node}));
+    }
+    return;
+  } else {
+    forward_nodes_[node->name()] = node;
+  }
+
+  // Track if this is the first trace (first node). If true, add all
+  // traces to common_traces_. Otherwise, remove uncommon traces from
+  // common traces_.
+  bool first_trace = false;
   if (!root_) {
     graph_root_.reset(new TFMultiGraphNode(kTFProfRoot));
-    root_.reset(new CodeNode(graph_root_.get(), nullptr));
+    root_.reset(new CodeNode(graph_root_.get(), nullptr, ""));
+    first_trace = true;
   }
 
   CodeNode* pre_code_node = root_.get();
   // TODO(xpan): Consider to release CodeDef after TFCode is built. It
   // takes a lot of memory.
+  std::set<string> traces;
   for (int i = 0; i < node->code().traces_size(); ++i) {
     // Unlike op name, which is globally unique, trace name is only unique
     // w.r.t. it's parent.
     const string& trace = GetTraceString(node->code().traces(i));
-    pre_code_node = pre_code_node->AddChildren(trace, &node->code().traces(i));
+    traces.insert(trace);
+    pre_code_node =
+        pre_code_node->AddChildren(trace, &node->code().traces(i), "");
     if (i == node->code().traces_size() - 1) {
       pre_code_node->node->AddGraphNode(node);
     }
   }
+  if (first_trace) {
+    common_traces_.insert(traces.begin(), traces.end());
+  } else {
+    for (auto it = common_traces_.begin(); it != common_traces_.end();) {
+      if (traces.find(*it) == traces.end()) {
+        common_traces_.erase(it++);
+      } else {
+        ++it;
+      }
+    }
+  }
 }
 
 void TFCode::Build() {
+  int64 unaccounted_nodes = 0;
+  for (auto it : grad_nodes_) {
+    const string& forward_name = it.first;
+    auto forward_it = forward_nodes_.find(forward_name);
+    if (forward_it == forward_nodes_.end()) {
+      unaccounted_nodes += 1;
+      continue;
+    }
+    TFGraphNode* fn = forward_it->second;
+    CodeNode* leaf = nullptr;
+    CodeNode* pre_code_node = root_.get();
+    for (int i = 0; i < fn->code().traces_size(); ++i) {
+      const string& trace =
+          GetTraceString(fn->code().traces(i)) + kGradientSuffix;
+      pre_code_node = pre_code_node->AddChildren(trace, &fn->code().traces(i),
+                                                 kGradientSuffix);
+      if (i == fn->code().traces_size() - 1) {
+        leaf = pre_code_node;
+      }
+    }
+    for (TFGraphNode* gn : it.second) {
+      leaf->node->AddGraphNode(gn);
+    }
+  }
+  if (unaccounted_nodes > 0) {
+    fprintf(stderr, "%lld gradient nodes not accounted\n", unaccounted_nodes);
+  }
+
+  // For trace that all traces share, such as "main", "apply_op", people
+  // are unlikely inerested. We track them and hide them from display.
+  if (forward_nodes_.size() > 100) {
+    std::set<string> tmp = common_traces_;
+    for (const string& t : tmp) {
+      common_traces_.insert(t + kGradientSuffix);
+    }
+  } else {
+    common_traces_.clear();
+  }
 }
 
 const ShowMultiNode* TFCode::ShowInternal(const Options& opts,
@@ -439,12 +541,12 @@ const ShowMultiNode* TFCode::ShowInternal(const Options& opts,
 void TFCode::Format(const CodeNode* root, const std::vector<CodeNode*>& nodes,
                     const Options& opts, string* display_str,
                     MultiGraphNodeProto* proto, std::vector<uint64>* call_ids) {
-  if (nodes.empty() && root->trace && opts.output_type == kOutput[3]) {
+  if (nodes.empty() && root->has_trace() && opts.output_type == kOutput[3]) {
     pprof_profile_->AddSample(root, call_ids);
   }
 
   for (CodeNode* node : nodes) {
-    if (root->trace && opts.output_type == kOutput[3]) {
+    if (root->has_trace() && opts.output_type == kOutput[3]) {
       uint64 loc_id = pprof_profile_->AddLocation(node, root);
       call_ids->push_back(loc_id);
     }
@@ -452,7 +554,7 @@ void TFCode::Format(const CodeNode* root, const std::vector<CodeNode*>& nodes,
     MultiGraphNodeProto* child = proto->add_children();
     child->MergeFrom(node->proto());
     Format(node, node->show_children, opts, display_str, child, call_ids);
-    if (root->trace && opts.output_type == kOutput[3]) {
+    if (root->has_trace() && opts.output_type == kOutput[3]) {
       call_ids->pop_back();
     }
   }
@@ -493,7 +595,8 @@ std::vector<CodeNode*> TFCode::PrintScope(const std::vector<CodeNode*> roots,
       continue;
     }
     int ident = last_ident;
-    bool show = ShouldShow(node, opts, depth);
+    bool show = ShouldShow(node, opts, depth) &&
+                common_traces_.find(node->name()) == common_traces_.end();
     if (show) ident += 2;
 
     std::vector<CodeNode*> show_cnodes =
diff --git a/tensorflow/core/profiler/internal/tfprof_code.h b/tensorflow/core/profiler/internal/tfprof_code.h
index 8da036e6b7..82bac8f415 100644
--- a/tensorflow/core/profiler/internal/tfprof_code.h
+++ b/tensorflow/core/profiler/internal/tfprof_code.h
@@ -57,8 +57,11 @@ class TFCode : public TFMultiShow {
   TFCode() {}
   ~TFCode() override {}
 
+  // Add nodes to the code view. Called before Build()
   void AddNode(TFGraphNode* node) override;
 
+  // Build the code view structure. Called after all nodes
+  // are added via AddNode().
   void Build() override;
 
  private:
@@ -82,9 +85,14 @@ class TFCode : public TFMultiShow {
   string FormatNode(CodeNode* node, const Options& opts, int64 indent) const;
   string FormatNodeMemory(CodeNode* node, int64 bytes, int64 total_bytes) const;
 
+  // Common traces track the code path that all traces share. Such as
+  // "main()", "create_op", etc.
+  std::set<string> common_traces_;
   std::unique_ptr<CodeNode> root_;
   std::unique_ptr<TFMultiGraphNode> graph_root_;
   std::unique_ptr<PprofProfile> pprof_profile_;
+  std::map<string, std::vector<TFGraphNode*>> grad_nodes_;
+  std::map<string, TFGraphNode*> forward_nodes_;
 };
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_node_show.h b/tensorflow/core/profiler/internal/tfprof_node_show.h
index cdcb3f499b..d3c5ffd7f6 100644
--- a/tensorflow/core/profiler/internal/tfprof_node_show.h
+++ b/tensorflow/core/profiler/internal/tfprof_node_show.h
@@ -111,11 +111,13 @@ class ShowMultiNode {
 
 class CodeNode : public ShowMultiNode {
  public:
-  explicit CodeNode(TFMultiGraphNode* node, const CodeDef::Trace* trace)
-      : ShowMultiNode(node), trace(trace) {}
+  CodeNode(TFMultiGraphNode* node, const CodeDef::Trace* trace,
+           const string& suffix)
+      : ShowMultiNode(node), trace_(trace), suffix_(suffix) {}
   ~CodeNode() override {}
 
-  CodeNode* AddChildren(const string& name, const CodeDef::Trace* trace) {
+  CodeNode* AddChildren(const string& name, const CodeDef::Trace* trace,
+                        const string suffix) {
     auto it = children_.find(name);
     if (it != children_.end()) {
       return it->second.get();
@@ -124,16 +126,23 @@ class CodeNode : public ShowMultiNode {
     graph_children_.push_back(
         std::unique_ptr<TFMultiGraphNode>(new TFMultiGraphNode(name)));
     auto child = &children_[name];
-    child->reset(new CodeNode(graph_children_.back().get(), trace));
+    child->reset(new CodeNode(graph_children_.back().get(), trace, suffix));
     children.push_back(child->get());
     return child->get();
   }
 
-  const CodeDef::Trace* trace;
+  bool has_trace() const { return trace_ != nullptr; }
+  const int32 lineno() const { return trace_->lineno(); }
+  string file() const { return trace_->file() + suffix_; }
+  string function() const { return trace_->function() + suffix_; }
+  int32 func_start_line() const { return trace_->func_start_line(); }
+
   std::vector<CodeNode*> children;
   std::vector<CodeNode*> show_children;
 
  private:
+  const CodeDef::Trace* trace_;
+  string suffix_;
   std::vector<std::unique_ptr<TFMultiGraphNode>> graph_children_;
   std::map<string, std::unique_ptr<CodeNode>> children_;
 };
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc
index f0db8edd4a..ea3e7b487c 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats.cc
@@ -177,9 +177,8 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
     fprintf(stderr, "Invalid RunMetadata for step %lld\n", step);
     return;
   }
-  if (steps_.find(step) != steps_.end()) {
-    fprintf(stderr, "The same step %lld has been added before.\n", step);
-    return;
+  if (steps_.find(step) == steps_.end()) {
+    steps_.insert(step);
   }
   steps_.insert(step);
 
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index b758edf87e..1e26a9897e 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -40,14 +40,22 @@ SIZE = 1300
 builder = option_builder.ProfileOptionBuilder
 
 
-def _extract_node(run_meta, node_names):
-  if not isinstance(node_names, list):
-    node_names = [node_names]
+def _extract_node(run_meta, node_name):
   ret = defaultdict(list)
   for dev_stat in run_meta.step_stats.dev_stats:
-    dev = dev_stat.device
+    dev = dev_stat.device.lower()
+    if dev.find('cpu:') > 0:
+      dev = dev[dev.find('cpu:'):]
+    elif dev.find('gpu:') > 0:
+      dev = dev[dev.find('gpu:'):]
+    else:
+      assert False, 'Unrecognized device name: %s' % dev
+
     for node_stat in dev_stat.node_stats:
-      if node_stat.node_name in node_names:
+      nname = node_stat.node_name
+      if nname.find(':') > 0:
+        nname = nname[:nname.find(':')]
+      if nname == node_name:
         ret[dev].append(node_stat)
   return ret
 
@@ -62,6 +70,7 @@ def _run_model():
     opts = builder.time_and_memory()
     opts['min_micros'] = 0
     opts['min_bytes'] = 0
+    opts['output'] = 'none'
     _ = sess.run(y,
                  options=config_pb2.RunOptions(
                      trace_level=config_pb2.RunOptions.FULL_TRACE),
@@ -85,9 +94,11 @@ def _run_loop_model():
                      trace_level=config_pb2.RunOptions.FULL_TRACE),
                  run_metadata=run_meta)
 
+    opts = builder.time_and_memory()
+    opts['output'] = 'none'
+
     tfprof_node = model_analyzer.profile(
-        sess.graph, run_meta,
-        options=builder.time_and_memory())
+        sess.graph, run_meta, options=opts)
     return tfprof_node, run_meta
 
 
@@ -104,17 +115,9 @@ class RunMetadataTest(test.TestCase):
       self.assertEqual(tfprof_node.children[0].name, 'MatMul')
       self.assertGreater(tfprof_node.children[0].exec_micros, 10)
 
-    ret = _extract_node(run_meta, ['MatMul', 'MatMul:MatMul'])
-    self.assertEqual(len(ret), 3)
-    self.assertTrue('/job:localhost/replica:0/task:0' + gpu_dev in ret)
-    del ret['/job:localhost/replica:0/task:0' + gpu_dev]
-
-    has_all_stream = False
-    for k, _ in six.iteritems(ret):
-      self.assertTrue(gpu_dev + '/stream' in k)
-      if gpu_dev + '/stream:all' in k:
-        has_all_stream = True
-    self.assertTrue(has_all_stream)
+    ret = _extract_node(run_meta, 'MatMul')
+    self.assertEqual(len(ret['gpu:0']), 1)
+    self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta)
 
   def testCPU(self):
     ops.reset_default_graph()
@@ -124,8 +127,7 @@ class RunMetadataTest(test.TestCase):
       self.assertGreater(tfprof_node.children[0].exec_micros, 0)
 
     ret = _extract_node(run_meta, 'MatMul')
-    self.assertEqual(len(ret), 1)
-    self.assertTrue('/job:localhost/replica:0/task:0/cpu:0' in ret)
+    self.assertEqual(len(ret['cpu:0']), 1)
 
     ret = _extract_node(run_meta, 'MatMul:MatMul')
     self.assertEqual(len(ret), 0)
@@ -137,10 +139,10 @@ class RunMetadataTest(test.TestCase):
       # The while-loop caused a node to appear 4 times in scheduling.
       ret = _extract_node(run_meta,
                           'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
-      self.assertEqual(len(ret['/job:localhost/replica:0/task:0/cpu:0']), 4)
+      self.assertEqual(len(ret['cpu:0']), 4)
 
       total_cpu_execs = 0
-      for node in ret['/job:localhost/replica:0/task:0/cpu:0']:
+      for node in ret['cpu:0']:
         total_cpu_execs += node.op_end_rel_micros
 
       mm_node = lib.SearchTFProfNode(
@@ -151,10 +153,31 @@ class RunMetadataTest(test.TestCase):
       self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
       self.assertEqual(mm_node.exec_micros, total_cpu_execs)
 
+  def testGradientGraph(self):
+    # Note: Please don't just adjust the test to make it pass.
+    # The code view logic depends on it.
+    ops.reset_default_graph()
+    _, _ = _run_loop_model()
+    graph = ops.get_default_graph()
+    forward_op = set()
+    backward_op = set()
+    back_to_forward = dict()
+    for op in graph.get_operations():
+      if op.name.find('gradients/') > 0 and op.name.find('_grad/') > 0:
+        backward_op.add(op.name)
+        idx1 = op.name.find('gradients/') + 10
+        idx2 = op.name.find('_grad/')
+        back_to_forward[op.name] = op.name[idx1:idx2]
+      else:
+        forward_op.add(op.name)
+
+    for _, f in six.iteritems(back_to_forward):
+      self.assertTrue(f in forward_op)
+
   # pylint: disable=pointless-string-statement
   """
-  TODO(xpan): This test is flaky because RunMetadata returned from TensorFlow
-  is random. Still being investigated.
+  # TODO(xpan): This test is flaky because RunMetadata returned from TensorFlow
+  # is random. Still being investigated.
   def testLoopGPU(self):
     if not test.is_gpu_available():
       return
@@ -165,30 +188,17 @@ class RunMetadataTest(test.TestCase):
       # The while-loop caused a node to appear 4 times in scheduling.
       ret = _extract_node(run_meta,
                           'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
-      self.assertEqual(len(ret['/job:localhost/replica:0/task:0/device:GPU:0']), 4)
+      self.assertEqual(len(ret['gpu:0']), 4, '%s' % run_meta)
 
       total_cpu_execs = 0
-      for node in ret['/job:localhost/replica:0/task:0/device:GPU:0']:
+      for node in ret['gpu:0']:
         total_cpu_execs += node.op_end_rel_micros
 
-      ret = _extract_node(
-          run_meta,
-          'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul:MatMul')
-      self.assertGreaterEqual(len(ret['/device:GPU:0/stream:all']), 4)
+      self.assertGreaterEqual(len(ret['gpu:0/stream:all']), 4, '%s' % run_meta)
 
       total_accelerator_execs = 0
-      for node in ret['/device:GPU:0/stream:all']:
+      for node in ret['gpu:0/stream:all']:
         total_accelerator_execs += node.op_end_rel_micros
-
-      mm_node = lib.SearchTFProfNode(
-          tfprof_node,
-          'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul')
-
-      self.assertEqual(mm_node.run_count, 4)
-      self.assertEqual(mm_node.accelerator_exec_micros, total_accelerator_execs)
-      self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
-      self.assertEqual(mm_node.exec_micros,
-                       total_cpu_execs + total_accelerator_execs)
   """
 
 
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 21d26b8782..841fe46393 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -21,6 +21,7 @@ import gzip
 import io
 import os
 import random
+import re
 
 from tensorflow.core.profiler import profile_pb2
 from tensorflow.core.protobuf import config_pb2
@@ -57,6 +58,68 @@ class PrintModelAnalysisTest(test.TestCase):
                          '  ScalarW (1, 1/1 params)\n',
                          f.read())
 
+  def testSelectEverthingDetail(self):
+    ops.reset_default_graph()
+    dev = '/gpu:0' if test.is_gpu_available() else '/cpu:0'
+    outfile = os.path.join(test.get_temp_dir(), 'dump')
+    opts = (builder(builder.trainable_variables_parameter())
+            .with_file_output(outfile)
+            .with_accounted_types(['.*'])
+            .select(['micros', 'bytes', 'params', 'float_ops', 'occurrence',
+                     'device', 'op_types', 'input_shapes']).build())
+
+    config = config_pb2.ConfigProto()
+    with session.Session(config=config) as sess, ops.device(dev):
+      x = lib.BuildSmallModel()
+
+      sess.run(variables.global_variables_initializer())
+      run_meta = config_pb2.RunMetadata()
+      _ = sess.run(x,
+                   options=config_pb2.RunOptions(
+                       trace_level=config_pb2.RunOptions.FULL_TRACE),
+                   run_metadata=run_meta)
+
+      model_analyzer.profile(
+          sess.graph, run_meta, options=opts)
+
+      with gfile.Open(outfile, 'r') as f:
+        # pylint: disable=line-too-long
+        outputs = f.read().split('\n')
+
+        self.assertEqual(outputs[0],
+                         'node name | # parameters | # float_ops | requested bytes | total execution time | accelerator execution time | cpu execution time | assigned devices | op types | op count (run|defined) | input shapes')
+        for o in outputs[1:]:
+          if o.find('Conv2D ') > 0:
+            metrics = o[o.find('(') +1: o.find(')')].split(',')
+            # Make sure time is profiled.
+            gap = 1 if test.is_gpu_available() else 2
+            for i in range(3, 6, gap):
+              mat = re.search('(.*)us/(.*)us', metrics[i])
+              self.assertGreater(float(mat.group(1)), 0.0)
+              self.assertGreater(float(mat.group(2)), 0.0)
+            # Make sure device is profiled.
+            if test.is_gpu_available():
+              self.assertTrue(metrics[6].find('gpu') > 0)
+              self.assertFalse(metrics[6].find('cpu') > 0)
+            else:
+              self.assertFalse(metrics[6].find('gpu') > 0)
+              self.assertTrue(metrics[6].find('cpu') > 0)
+            # Make sure float_ops is profiled.
+            mat = re.search('(.*)k/(.*)k flops', metrics[1].strip())
+            self.assertGreater(float(mat.group(1)), 0.0)
+            self.assertGreater(float(mat.group(2)), 0.0)
+            # Make sure op_count is profiled.
+            self.assertEqual(metrics[8].strip(), '1/1|1/1')
+            # Make sure input_shapes is profiled.
+            self.assertEqual(metrics[9].strip(), '0:2x6x6x3|1:3x3x3x6')
+
+          if o.find('DW (3x3x3x6') > 0:
+            metrics = o[o.find('(') +1: o.find(')')].split(',')
+            mat = re.search('(.*)/(.*) params', metrics[1].strip())
+            self.assertGreater(float(mat.group(1)), 0.0)
+            self.assertGreater(float(mat.group(2)), 0.0)
+        # pylint: enable=line-too-long
+
   def testSelectEverything(self):
     ops.reset_default_graph()
     outfile = os.path.join(test.get_temp_dir(), 'dump')
@@ -151,29 +214,38 @@ class PrintModelAnalysisTest(test.TestCase):
       with gfile.Open(outfile, 'r') as f:
         lines = f.read().split('\n')
         result = '\n'.join([l[:min(len(l), 80)] for l in lines])
-        self.assertEqual('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/91.04k flops)\n  model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (0/1.80k para\n    model_analyzer_testlib.py:35:BuildSmallModel:image = array_ops... (0/0 param\n    model_analyzer_testlib.py:39:BuildSmallModel:initializer=init_... (0/4 param\n    model_analyzer_testlib.py:43:BuildSmallModel:initializer=init_... (0/648 par\n    model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n    model_analyzer_testlib.py:48:BuildSmallModel:initializer=init_... (0/1.15k p\n    model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n  model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (0/1.04k para\n  model_analyzer_testlib.py:64:BuildFullModel:target = array_op... (0/0 params, \n  model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (0/0 params, \n  model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min... (0/0 params, \n',
+        self.assertEqual('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/91.04k flops)\n  model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (0/1.80k para\n    model_analyzer_testlib.py:35:BuildSmallModel:image = array_ops... (0/0 param\n    model_analyzer_testlib.py:39:BuildSmallModel:initializer=init_... (0/4 param\n    model_analyzer_testlib.py:43:BuildSmallModel:initializer=init_... (0/648 par\n    model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n    model_analyzer_testlib.py:48:BuildSmallModel:initializer=init_... (0/1.15k p\n    model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n  model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (gradient) (0\n    model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (gradient)\n    model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (gradient)\n  model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (0/1.04k para\n  model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (gradient) (0\n  model_analyzer_testlib.py:64:BuildFullModel:target = array_op... (0/0 params, \n  model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (0/0 params, \n  model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (gradient) (0\n  model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min... (0/0 params, \n',
                          result)
 
       self.assertLess(0, tfprof_node.total_exec_micros)
       self.assertEqual(2844, tfprof_node.total_parameters)
       self.assertEqual(91040, tfprof_node.total_float_ops)
-      self.assertEqual(5, len(tfprof_node.children))
+      self.assertEqual(8, len(tfprof_node.children))
       self.assertEqual('_TFProfRoot', tfprof_node.name)
       self.assertEqual(
           'model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_...',
           tfprof_node.children[0].name)
       self.assertEqual(
-          'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c...',
+          'model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (gradient)',
           tfprof_node.children[1].name)
       self.assertEqual(
-          'model_analyzer_testlib.py:64:BuildFullModel:target = array_op...',
+          'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c...',
           tfprof_node.children[2].name)
       self.assertEqual(
-          'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_...',
+          'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (gradient)',
           tfprof_node.children[3].name)
       self.assertEqual(
-          'model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min...',
+          'model_analyzer_testlib.py:64:BuildFullModel:target = array_op...',
           tfprof_node.children[4].name)
+      self.assertEqual(
+          'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_...',
+          tfprof_node.children[5].name)
+      self.assertEqual(
+          'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (gradient)',
+          tfprof_node.children[6].name)
+      self.assertEqual(
+          'model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min...',
+          tfprof_node.children[7].name)
       # pylint: enable=line-too-long
 
   def testCodeViewLeafGraphNode(self):
diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
index 502fc49bb6..641895ffe5 100644
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@@ -406,7 +406,7 @@ class ProfileOptionBuilder(object):
     """Generate a pprof profile gzip file.
 
     To use the pprof file:
-      pprof -png --nodecount=20 --sample_index=1 <pprof_file>
+      pprof -png --nodecount=100 --sample_index=1 <pprof_file>
 
     Args:
       pprof_file: filename for output, usually suffixed with .pb.gz.
diff --git a/tensorflow/python/profiler/profiler_test.py b/tensorflow/python/profiler/profiler_test.py
index 2170e1bdea..46afe1fe55 100644
--- a/tensorflow/python/profiler/profiler_test.py
+++ b/tensorflow/python/profiler/profiler_test.py
@@ -183,6 +183,51 @@ class ProfilerTest(test.TestCase):
       checker = advice_pb.checkers['ExpensiveOperationChecker']
       self.assertGreater(len(checker.reports), 0)
 
+  def testMultipleProfilePerStep(self):
+    ops.reset_default_graph()
+    opts = (builder(builder.trainable_variables_parameter())
+            .with_empty_output()
+            .with_accounted_types(['.*'])
+            .select(['micros', 'bytes', 'peak_bytes',
+                     'residual_bytes', 'output_bytes']).build())
+
+    r = lib.BuildSmallModel()
+    sess = session.Session()
+    profiler = model_analyzer.Profiler(sess.graph)
+
+    init_var_run_meta = config_pb2.RunMetadata()
+    sess.run(variables.global_variables_initializer(),
+             options=config_pb2.RunOptions(
+                 trace_level=config_pb2.RunOptions.FULL_TRACE),
+             run_metadata=init_var_run_meta)
+
+    train_run_meta = config_pb2.RunMetadata()
+    sess.run(r,
+             options=config_pb2.RunOptions(
+                 trace_level=config_pb2.RunOptions.FULL_TRACE),
+             run_metadata=train_run_meta)
+
+    profiler.add_step(0, train_run_meta)
+    ret1 = profiler.profile_name_scope(opts)
+    n1 = lib.SearchTFProfNode(
+        ret1, 'DW/Initializer/random_normal/RandomStandardNormal')
+    # Without the var initialization run_meta, it doesn't have the
+    # information of var_initialization.
+    self.assertEqual(n1.exec_micros, 0)
+    self.assertEqual(n1.requested_bytes, 0)
+    self.assertEqual(n1.peak_bytes, 0)
+    self.assertEqual(n1.residual_bytes, 0)
+
+    profiler.add_step(0, init_var_run_meta)
+    ret2 = profiler.profile_name_scope(opts)
+    n2 = lib.SearchTFProfNode(
+        ret2, 'DW/Initializer/random_normal/RandomStandardNormal')
+    # After adding the var initialization run_meta.
+    self.assertGreater(n2.exec_micros, 0)
+    self.assertGreater(n2.requested_bytes, 0)
+    self.assertGreater(n2.peak_bytes, 0)
+    self.assertGreater(n2.residual_bytes, 0)
+
 
 if __name__ == '__main__':
   test.main()
author	A. Unique TensorFlower <gardener@tensorflow.org>	2017-08-15 17:48:55 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-08-15 17:52:23 -0700
commit	93b21f7b1fa725299f86058436f034b15350de52 (patch)
tree	6b0dcf69ed06601680b0b14eb0a9c25edfc292e9 /tensorflow
parent	5db8be66563227f5bba37aeddce3951239dcd947 (diff)