diff options
author | 2017-08-15 17:48:55 -0700 | |
---|---|---|
committer | 2017-08-15 17:52:23 -0700 | |
commit | 93b21f7b1fa725299f86058436f034b15350de52 (patch) | |
tree | 6b0dcf69ed06601680b0b14eb0a9c25edfc292e9 /tensorflow | |
parent | 5db8be66563227f5bba37aeddce3951239dcd947 (diff) |
1. Adjust code view pprof image to better visualize backprop.
2. Allow to add multiple RunMetadata for 1 step, e.g. 1 for var initialization,
1 for training. So it has a complete profile.
3. Improve tests a bit.
PiperOrigin-RevId: 165385567
Diffstat (limited to 'tensorflow')
-rw-r--r-- | tensorflow/core/profiler/README.md | 7 | ||||
-rw-r--r-- | tensorflow/core/profiler/g3doc/pprof.jpg | bin | 267669 -> 255048 bytes | |||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_code.cc | 141 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_code.h | 8 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node_show.h | 19 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_stats.cc | 5 | ||||
-rw-r--r-- | tensorflow/python/profiler/internal/run_metadata_test.py | 92 | ||||
-rw-r--r-- | tensorflow/python/profiler/model_analyzer_test.py | 84 | ||||
-rw-r--r-- | tensorflow/python/profiler/option_builder.py | 2 | ||||
-rw-r--r-- | tensorflow/python/profiler/profiler_test.py | 45 |
10 files changed, 324 insertions, 79 deletions
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md index 06118e6eb2..f755089fc7 100644 --- a/tensorflow/core/profiler/README.md +++ b/tensorflow/core/profiler/README.md @@ -202,14 +202,13 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file. # nodes created by the Python call stack. # Nevertheless, it pops critical Python code path for us. # -# `-trim_name_regexes` trims the python call stack, which are always the same -# for the leaves. +# `-trim_name_regexes` trims the some traces that have no valuable information. # `-select accelerator_micros` pick accelerator time for pprof graph. User # can also generate memory profile using `-select bytes` -tfprof> code -max_depth 100 -trim_name_regexes '^ops.py.*' -select accelerator_micros -output pprof:outfile=<filename> +tfprof> code -select accelerator_micros -max_depth 100000 -output pprof:outfile=<filename> -trim_name_regexes .*apply_op.* # Use pprof to visualize the generated file. -pprof -png --nodecount=20 --sample_index=1 <filename> +pprof -png --nodecount=100 --sample_index=1 <filename> ``` <left> diff --git a/tensorflow/core/profiler/g3doc/pprof.jpg b/tensorflow/core/profiler/g3doc/pprof.jpg Binary files differindex bf28967d68..8ffeda8360 100644 --- a/tensorflow/core/profiler/g3doc/pprof.jpg +++ b/tensorflow/core/profiler/g3doc/pprof.jpg diff --git a/tensorflow/core/profiler/internal/tfprof_code.cc b/tensorflow/core/profiler/internal/tfprof_code.cc index 1c512a7ca1..7f4d682cda 100644 --- a/tensorflow/core/profiler/internal/tfprof_code.cc +++ b/tensorflow/core/profiler/internal/tfprof_code.cc @@ -32,6 +32,9 @@ limitations under the License. namespace tensorflow { namespace tfprof { namespace { + +const char* const kGradientSuffix = " (gradient)"; + // Convert to Trace proto into a short readable string. string GetTraceString(const CodeDef::Trace& trace) { string ntrace = io::Basename(trace.file()).ToString(); @@ -49,6 +52,24 @@ string GetTraceString(const CodeDef::Trace& trace) { return ntrace; } +bool IsGradNode(const string& name, string* forward_name) { + // Given a forward operation with name op, its gradient op has the following + // name: ...gradients/op_grad/... + // TODO(xpan): This is hacky. + auto grad_prefix = name.find("gradients/"); + auto grad_suffix = name.find("_grad/"); + if (grad_prefix == name.npos || grad_suffix == name.npos) { + return false; + } + auto start = grad_prefix + string("gradients/").length(); + auto len = grad_suffix - start; + if (len <= 0) { + return false; + } + *forward_name = name.substr(start, len); + return true; +} + // StringTable maps each string to an id. class StringTable { public: @@ -170,12 +191,17 @@ class Samples { for (const CodeNode* cn : all_leaf) { for (auto gn_it : cn->node->graph_nodes()) { const TFGraphNode* gn = gn_it.second; - pprof::Sample* sample_pb = &sample_table_[gn->name()]; + string name = gn->name(); + // Generate a new trace name, in case the name is taken. + while (sample_table_.find(name) != sample_table_.end()) { + name += '@'; + } + pprof::Sample* sample_pb = &sample_table_[name]; for (uint64 id : location_ids) { sample_pb->mutable_location_id()->Add(id); } pprof::Label* label_pb = sample_pb->mutable_label()->Add(); - label_pb->set_key(string_table_->GetIndex("node_name")); + label_pb->set_key(string_table_->GetIndex("graph node:")); label_pb->set_str(string_table_->GetIndex(gn->name())); sample_pb->mutable_value()->Add(1); @@ -240,11 +266,11 @@ class PprofProfileImpl : public PprofProfile { samples_(new Samples(&string_table_, opts)) {} uint64 AddLocation(const CodeNode* callee, const CodeNode* caller) override { - const string& file_path = caller->trace->file(); - uint64 lineno = caller->trace->lineno(); - const string& callee_file_path = callee->trace->file(); - const string& callee_function = callee->trace->function(); - uint64 callee_func_start_line = callee->trace->func_start_line(); + const string& file_path = caller->file(); + uint64 lineno = caller->lineno(); + const string& callee_file_path = callee->file(); + const string& callee_function = callee->function(); + uint64 callee_func_start_line = callee->func_start_line(); return loc_table_->GetIndex(file_path, lineno, callee_function, callee_file_path, callee_func_start_line); @@ -274,7 +300,7 @@ class PprofProfileImpl : public PprofProfile { if (!s.ok()) return s; s = zlib_output_buffer->Close(); if (!s.ok()) return s; - fprintf(stdout, "\nRun pprof -png --nodecount=20 --sample_index=1 <%s>\n", + fprintf(stdout, "\nRun pprof -png --nodecount=100 --sample_index=1 <%s>\n", filename.c_str()); return s; } @@ -303,19 +329,20 @@ class PprofProfileImpl : public PprofProfile { string_table_.GetIndex("CPU execution time.")); } } else if (type == kShown[0]) { - sample_type->set_unit(string_table_.GetIndex("requested bytes")); + sample_type->set_unit(string_table_.GetIndex("bytes")); profile_pb->mutable_comment()->Add( - string_table_.GetIndex("Sum of operation total requested memory.")); + string_table_.GetIndex("Sum of operation total memory requests, " + "excluding deallocations.")); } else if (type == kShown[11]) { - sample_type->set_unit(string_table_.GetIndex("peak bytes")); + sample_type->set_unit(string_table_.GetIndex("bytes")); profile_pb->mutable_comment()->Add( string_table_.GetIndex("Sum of operation peak memory usage.")); } else if (type == kShown[12]) { - sample_type->set_unit(string_table_.GetIndex("residual bytes")); + sample_type->set_unit(string_table_.GetIndex("bytes")); profile_pb->mutable_comment()->Add(string_table_.GetIndex( "Sum of operation allocated memory after finish.")); } else if (type == kShown[13]) { - sample_type->set_unit(string_table_.GetIndex("output bytes")); + sample_type->set_unit(string_table_.GetIndex("bytes")); profile_pb->mutable_comment()->Add( string_table_.GetIndex("Sum of operation output size.")); } else if (type == kShown[2]) { @@ -357,26 +384,101 @@ void TFCode::AddNode(TFGraphNode* node) { if (node->code().traces_size() == 0) { return; } + + // We infer the forward operation name from gradient op name. So, we can + // map gradient op traces to forward op traces. + // E.g. gradient node of 'inp_1/Conv2D' would be 'gradients/inp_1/Conv2D_grad. + string forward_name; + if (IsGradNode(node->name(), &forward_name)) { + auto grad_nodes_it = grad_nodes_.find(forward_name); + if (grad_nodes_it != grad_nodes_.end()) { + grad_nodes_it->second.push_back(node); + } else { + grad_nodes_.insert( + std::pair<string, std::vector<TFGraphNode*>>(forward_name, {node})); + } + return; + } else { + forward_nodes_[node->name()] = node; + } + + // Track if this is the first trace (first node). If true, add all + // traces to common_traces_. Otherwise, remove uncommon traces from + // common traces_. + bool first_trace = false; if (!root_) { graph_root_.reset(new TFMultiGraphNode(kTFProfRoot)); - root_.reset(new CodeNode(graph_root_.get(), nullptr)); + root_.reset(new CodeNode(graph_root_.get(), nullptr, "")); + first_trace = true; } CodeNode* pre_code_node = root_.get(); // TODO(xpan): Consider to release CodeDef after TFCode is built. It // takes a lot of memory. + std::set<string> traces; for (int i = 0; i < node->code().traces_size(); ++i) { // Unlike op name, which is globally unique, trace name is only unique // w.r.t. it's parent. const string& trace = GetTraceString(node->code().traces(i)); - pre_code_node = pre_code_node->AddChildren(trace, &node->code().traces(i)); + traces.insert(trace); + pre_code_node = + pre_code_node->AddChildren(trace, &node->code().traces(i), ""); if (i == node->code().traces_size() - 1) { pre_code_node->node->AddGraphNode(node); } } + if (first_trace) { + common_traces_.insert(traces.begin(), traces.end()); + } else { + for (auto it = common_traces_.begin(); it != common_traces_.end();) { + if (traces.find(*it) == traces.end()) { + common_traces_.erase(it++); + } else { + ++it; + } + } + } } void TFCode::Build() { + int64 unaccounted_nodes = 0; + for (auto it : grad_nodes_) { + const string& forward_name = it.first; + auto forward_it = forward_nodes_.find(forward_name); + if (forward_it == forward_nodes_.end()) { + unaccounted_nodes += 1; + continue; + } + TFGraphNode* fn = forward_it->second; + CodeNode* leaf = nullptr; + CodeNode* pre_code_node = root_.get(); + for (int i = 0; i < fn->code().traces_size(); ++i) { + const string& trace = + GetTraceString(fn->code().traces(i)) + kGradientSuffix; + pre_code_node = pre_code_node->AddChildren(trace, &fn->code().traces(i), + kGradientSuffix); + if (i == fn->code().traces_size() - 1) { + leaf = pre_code_node; + } + } + for (TFGraphNode* gn : it.second) { + leaf->node->AddGraphNode(gn); + } + } + if (unaccounted_nodes > 0) { + fprintf(stderr, "%lld gradient nodes not accounted\n", unaccounted_nodes); + } + + // For trace that all traces share, such as "main", "apply_op", people + // are unlikely inerested. We track them and hide them from display. + if (forward_nodes_.size() > 100) { + std::set<string> tmp = common_traces_; + for (const string& t : tmp) { + common_traces_.insert(t + kGradientSuffix); + } + } else { + common_traces_.clear(); + } } const ShowMultiNode* TFCode::ShowInternal(const Options& opts, @@ -439,12 +541,12 @@ const ShowMultiNode* TFCode::ShowInternal(const Options& opts, void TFCode::Format(const CodeNode* root, const std::vector<CodeNode*>& nodes, const Options& opts, string* display_str, MultiGraphNodeProto* proto, std::vector<uint64>* call_ids) { - if (nodes.empty() && root->trace && opts.output_type == kOutput[3]) { + if (nodes.empty() && root->has_trace() && opts.output_type == kOutput[3]) { pprof_profile_->AddSample(root, call_ids); } for (CodeNode* node : nodes) { - if (root->trace && opts.output_type == kOutput[3]) { + if (root->has_trace() && opts.output_type == kOutput[3]) { uint64 loc_id = pprof_profile_->AddLocation(node, root); call_ids->push_back(loc_id); } @@ -452,7 +554,7 @@ void TFCode::Format(const CodeNode* root, const std::vector<CodeNode*>& nodes, MultiGraphNodeProto* child = proto->add_children(); child->MergeFrom(node->proto()); Format(node, node->show_children, opts, display_str, child, call_ids); - if (root->trace && opts.output_type == kOutput[3]) { + if (root->has_trace() && opts.output_type == kOutput[3]) { call_ids->pop_back(); } } @@ -493,7 +595,8 @@ std::vector<CodeNode*> TFCode::PrintScope(const std::vector<CodeNode*> roots, continue; } int ident = last_ident; - bool show = ShouldShow(node, opts, depth); + bool show = ShouldShow(node, opts, depth) && + common_traces_.find(node->name()) == common_traces_.end(); if (show) ident += 2; std::vector<CodeNode*> show_cnodes = diff --git a/tensorflow/core/profiler/internal/tfprof_code.h b/tensorflow/core/profiler/internal/tfprof_code.h index 8da036e6b7..82bac8f415 100644 --- a/tensorflow/core/profiler/internal/tfprof_code.h +++ b/tensorflow/core/profiler/internal/tfprof_code.h @@ -57,8 +57,11 @@ class TFCode : public TFMultiShow { TFCode() {} ~TFCode() override {} + // Add nodes to the code view. Called before Build() void AddNode(TFGraphNode* node) override; + // Build the code view structure. Called after all nodes + // are added via AddNode(). void Build() override; private: @@ -82,9 +85,14 @@ class TFCode : public TFMultiShow { string FormatNode(CodeNode* node, const Options& opts, int64 indent) const; string FormatNodeMemory(CodeNode* node, int64 bytes, int64 total_bytes) const; + // Common traces track the code path that all traces share. Such as + // "main()", "create_op", etc. + std::set<string> common_traces_; std::unique_ptr<CodeNode> root_; std::unique_ptr<TFMultiGraphNode> graph_root_; std::unique_ptr<PprofProfile> pprof_profile_; + std::map<string, std::vector<TFGraphNode*>> grad_nodes_; + std::map<string, TFGraphNode*> forward_nodes_; }; } // namespace tfprof } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/tfprof_node_show.h b/tensorflow/core/profiler/internal/tfprof_node_show.h index cdcb3f499b..d3c5ffd7f6 100644 --- a/tensorflow/core/profiler/internal/tfprof_node_show.h +++ b/tensorflow/core/profiler/internal/tfprof_node_show.h @@ -111,11 +111,13 @@ class ShowMultiNode { class CodeNode : public ShowMultiNode { public: - explicit CodeNode(TFMultiGraphNode* node, const CodeDef::Trace* trace) - : ShowMultiNode(node), trace(trace) {} + CodeNode(TFMultiGraphNode* node, const CodeDef::Trace* trace, + const string& suffix) + : ShowMultiNode(node), trace_(trace), suffix_(suffix) {} ~CodeNode() override {} - CodeNode* AddChildren(const string& name, const CodeDef::Trace* trace) { + CodeNode* AddChildren(const string& name, const CodeDef::Trace* trace, + const string suffix) { auto it = children_.find(name); if (it != children_.end()) { return it->second.get(); @@ -124,16 +126,23 @@ class CodeNode : public ShowMultiNode { graph_children_.push_back( std::unique_ptr<TFMultiGraphNode>(new TFMultiGraphNode(name))); auto child = &children_[name]; - child->reset(new CodeNode(graph_children_.back().get(), trace)); + child->reset(new CodeNode(graph_children_.back().get(), trace, suffix)); children.push_back(child->get()); return child->get(); } - const CodeDef::Trace* trace; + bool has_trace() const { return trace_ != nullptr; } + const int32 lineno() const { return trace_->lineno(); } + string file() const { return trace_->file() + suffix_; } + string function() const { return trace_->function() + suffix_; } + int32 func_start_line() const { return trace_->func_start_line(); } + std::vector<CodeNode*> children; std::vector<CodeNode*> show_children; private: + const CodeDef::Trace* trace_; + string suffix_; std::vector<std::unique_ptr<TFMultiGraphNode>> graph_children_; std::map<string, std::unique_ptr<CodeNode>> children_; }; diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc index f0db8edd4a..ea3e7b487c 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats.cc +++ b/tensorflow/core/profiler/internal/tfprof_stats.cc @@ -177,9 +177,8 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) { fprintf(stderr, "Invalid RunMetadata for step %lld\n", step); return; } - if (steps_.find(step) != steps_.end()) { - fprintf(stderr, "The same step %lld has been added before.\n", step); - return; + if (steps_.find(step) == steps_.end()) { + steps_.insert(step); } steps_.insert(step); diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py index b758edf87e..1e26a9897e 100644 --- a/tensorflow/python/profiler/internal/run_metadata_test.py +++ b/tensorflow/python/profiler/internal/run_metadata_test.py @@ -40,14 +40,22 @@ SIZE = 1300 builder = option_builder.ProfileOptionBuilder -def _extract_node(run_meta, node_names): - if not isinstance(node_names, list): - node_names = [node_names] +def _extract_node(run_meta, node_name): ret = defaultdict(list) for dev_stat in run_meta.step_stats.dev_stats: - dev = dev_stat.device + dev = dev_stat.device.lower() + if dev.find('cpu:') > 0: + dev = dev[dev.find('cpu:'):] + elif dev.find('gpu:') > 0: + dev = dev[dev.find('gpu:'):] + else: + assert False, 'Unrecognized device name: %s' % dev + for node_stat in dev_stat.node_stats: - if node_stat.node_name in node_names: + nname = node_stat.node_name + if nname.find(':') > 0: + nname = nname[:nname.find(':')] + if nname == node_name: ret[dev].append(node_stat) return ret @@ -62,6 +70,7 @@ def _run_model(): opts = builder.time_and_memory() opts['min_micros'] = 0 opts['min_bytes'] = 0 + opts['output'] = 'none' _ = sess.run(y, options=config_pb2.RunOptions( trace_level=config_pb2.RunOptions.FULL_TRACE), @@ -85,9 +94,11 @@ def _run_loop_model(): trace_level=config_pb2.RunOptions.FULL_TRACE), run_metadata=run_meta) + opts = builder.time_and_memory() + opts['output'] = 'none' + tfprof_node = model_analyzer.profile( - sess.graph, run_meta, - options=builder.time_and_memory()) + sess.graph, run_meta, options=opts) return tfprof_node, run_meta @@ -104,17 +115,9 @@ class RunMetadataTest(test.TestCase): self.assertEqual(tfprof_node.children[0].name, 'MatMul') self.assertGreater(tfprof_node.children[0].exec_micros, 10) - ret = _extract_node(run_meta, ['MatMul', 'MatMul:MatMul']) - self.assertEqual(len(ret), 3) - self.assertTrue('/job:localhost/replica:0/task:0' + gpu_dev in ret) - del ret['/job:localhost/replica:0/task:0' + gpu_dev] - - has_all_stream = False - for k, _ in six.iteritems(ret): - self.assertTrue(gpu_dev + '/stream' in k) - if gpu_dev + '/stream:all' in k: - has_all_stream = True - self.assertTrue(has_all_stream) + ret = _extract_node(run_meta, 'MatMul') + self.assertEqual(len(ret['gpu:0']), 1) + self.assertEqual(len(ret['gpu:0/stream:all']), 1, '%s' % run_meta) def testCPU(self): ops.reset_default_graph() @@ -124,8 +127,7 @@ class RunMetadataTest(test.TestCase): self.assertGreater(tfprof_node.children[0].exec_micros, 0) ret = _extract_node(run_meta, 'MatMul') - self.assertEqual(len(ret), 1) - self.assertTrue('/job:localhost/replica:0/task:0/cpu:0' in ret) + self.assertEqual(len(ret['cpu:0']), 1) ret = _extract_node(run_meta, 'MatMul:MatMul') self.assertEqual(len(ret), 0) @@ -137,10 +139,10 @@ class RunMetadataTest(test.TestCase): # The while-loop caused a node to appear 4 times in scheduling. ret = _extract_node(run_meta, 'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul') - self.assertEqual(len(ret['/job:localhost/replica:0/task:0/cpu:0']), 4) + self.assertEqual(len(ret['cpu:0']), 4) total_cpu_execs = 0 - for node in ret['/job:localhost/replica:0/task:0/cpu:0']: + for node in ret['cpu:0']: total_cpu_execs += node.op_end_rel_micros mm_node = lib.SearchTFProfNode( @@ -151,10 +153,31 @@ class RunMetadataTest(test.TestCase): self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs) self.assertEqual(mm_node.exec_micros, total_cpu_execs) + def testGradientGraph(self): + # Note: Please don't just adjust the test to make it pass. + # The code view logic depends on it. + ops.reset_default_graph() + _, _ = _run_loop_model() + graph = ops.get_default_graph() + forward_op = set() + backward_op = set() + back_to_forward = dict() + for op in graph.get_operations(): + if op.name.find('gradients/') > 0 and op.name.find('_grad/') > 0: + backward_op.add(op.name) + idx1 = op.name.find('gradients/') + 10 + idx2 = op.name.find('_grad/') + back_to_forward[op.name] = op.name[idx1:idx2] + else: + forward_op.add(op.name) + + for _, f in six.iteritems(back_to_forward): + self.assertTrue(f in forward_op) + # pylint: disable=pointless-string-statement """ - TODO(xpan): This test is flaky because RunMetadata returned from TensorFlow - is random. Still being investigated. + # TODO(xpan): This test is flaky because RunMetadata returned from TensorFlow + # is random. Still being investigated. def testLoopGPU(self): if not test.is_gpu_available(): return @@ -165,30 +188,17 @@ class RunMetadataTest(test.TestCase): # The while-loop caused a node to appear 4 times in scheduling. ret = _extract_node(run_meta, 'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul') - self.assertEqual(len(ret['/job:localhost/replica:0/task:0/device:GPU:0']), 4) + self.assertEqual(len(ret['gpu:0']), 4, '%s' % run_meta) total_cpu_execs = 0 - for node in ret['/job:localhost/replica:0/task:0/device:GPU:0']: + for node in ret['gpu:0']: total_cpu_execs += node.op_end_rel_micros - ret = _extract_node( - run_meta, - 'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul:MatMul') - self.assertGreaterEqual(len(ret['/device:GPU:0/stream:all']), 4) + self.assertGreaterEqual(len(ret['gpu:0/stream:all']), 4, '%s' % run_meta) total_accelerator_execs = 0 - for node in ret['/device:GPU:0/stream:all']: + for node in ret['gpu:0/stream:all']: total_accelerator_execs += node.op_end_rel_micros - - mm_node = lib.SearchTFProfNode( - tfprof_node, - 'rnn/while/rnn/basic_rnn_cell/basic_rnn_cell/MatMul') - - self.assertEqual(mm_node.run_count, 4) - self.assertEqual(mm_node.accelerator_exec_micros, total_accelerator_execs) - self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs) - self.assertEqual(mm_node.exec_micros, - total_cpu_execs + total_accelerator_execs) """ diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py index 21d26b8782..841fe46393 100644 --- a/tensorflow/python/profiler/model_analyzer_test.py +++ b/tensorflow/python/profiler/model_analyzer_test.py @@ -21,6 +21,7 @@ import gzip import io import os import random +import re from tensorflow.core.profiler import profile_pb2 from tensorflow.core.protobuf import config_pb2 @@ -57,6 +58,68 @@ class PrintModelAnalysisTest(test.TestCase): ' ScalarW (1, 1/1 params)\n', f.read()) + def testSelectEverthingDetail(self): + ops.reset_default_graph() + dev = '/gpu:0' if test.is_gpu_available() else '/cpu:0' + outfile = os.path.join(test.get_temp_dir(), 'dump') + opts = (builder(builder.trainable_variables_parameter()) + .with_file_output(outfile) + .with_accounted_types(['.*']) + .select(['micros', 'bytes', 'params', 'float_ops', 'occurrence', + 'device', 'op_types', 'input_shapes']).build()) + + config = config_pb2.ConfigProto() + with session.Session(config=config) as sess, ops.device(dev): + x = lib.BuildSmallModel() + + sess.run(variables.global_variables_initializer()) + run_meta = config_pb2.RunMetadata() + _ = sess.run(x, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE), + run_metadata=run_meta) + + model_analyzer.profile( + sess.graph, run_meta, options=opts) + + with gfile.Open(outfile, 'r') as f: + # pylint: disable=line-too-long + outputs = f.read().split('\n') + + self.assertEqual(outputs[0], + 'node name | # parameters | # float_ops | requested bytes | total execution time | accelerator execution time | cpu execution time | assigned devices | op types | op count (run|defined) | input shapes') + for o in outputs[1:]: + if o.find('Conv2D ') > 0: + metrics = o[o.find('(') +1: o.find(')')].split(',') + # Make sure time is profiled. + gap = 1 if test.is_gpu_available() else 2 + for i in range(3, 6, gap): + mat = re.search('(.*)us/(.*)us', metrics[i]) + self.assertGreater(float(mat.group(1)), 0.0) + self.assertGreater(float(mat.group(2)), 0.0) + # Make sure device is profiled. + if test.is_gpu_available(): + self.assertTrue(metrics[6].find('gpu') > 0) + self.assertFalse(metrics[6].find('cpu') > 0) + else: + self.assertFalse(metrics[6].find('gpu') > 0) + self.assertTrue(metrics[6].find('cpu') > 0) + # Make sure float_ops is profiled. + mat = re.search('(.*)k/(.*)k flops', metrics[1].strip()) + self.assertGreater(float(mat.group(1)), 0.0) + self.assertGreater(float(mat.group(2)), 0.0) + # Make sure op_count is profiled. + self.assertEqual(metrics[8].strip(), '1/1|1/1') + # Make sure input_shapes is profiled. + self.assertEqual(metrics[9].strip(), '0:2x6x6x3|1:3x3x3x6') + + if o.find('DW (3x3x3x6') > 0: + metrics = o[o.find('(') +1: o.find(')')].split(',') + mat = re.search('(.*)/(.*) params', metrics[1].strip()) + self.assertGreater(float(mat.group(1)), 0.0) + self.assertGreater(float(mat.group(2)), 0.0) + # pylint: enable=line-too-long + def testSelectEverything(self): ops.reset_default_graph() outfile = os.path.join(test.get_temp_dir(), 'dump') @@ -151,29 +214,38 @@ class PrintModelAnalysisTest(test.TestCase): with gfile.Open(outfile, 'r') as f: lines = f.read().split('\n') result = '\n'.join([l[:min(len(l), 80)] for l in lines]) - self.assertEqual('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/91.04k flops)\n model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (0/1.80k para\n model_analyzer_testlib.py:35:BuildSmallModel:image = array_ops... (0/0 param\n model_analyzer_testlib.py:39:BuildSmallModel:initializer=init_... (0/4 param\n model_analyzer_testlib.py:43:BuildSmallModel:initializer=init_... (0/648 par\n model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n model_analyzer_testlib.py:48:BuildSmallModel:initializer=init_... (0/1.15k p\n model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (0/1.04k para\n model_analyzer_testlib.py:64:BuildFullModel:target = array_op... (0/0 params, \n model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (0/0 params, \n model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min... (0/0 params, \n', + self.assertEqual('node name | # parameters | # float_ops\n_TFProfRoot (--/2.84k params, --/91.04k flops)\n model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (0/1.80k para\n model_analyzer_testlib.py:35:BuildSmallModel:image = array_ops... (0/0 param\n model_analyzer_testlib.py:39:BuildSmallModel:initializer=init_... (0/4 param\n model_analyzer_testlib.py:43:BuildSmallModel:initializer=init_... (0/648 par\n model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n model_analyzer_testlib.py:48:BuildSmallModel:initializer=init_... (0/1.15k p\n model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (0/0 param\n model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (gradient) (0\n model_analyzer_testlib.py:44:BuildSmallModel:x = nn_ops.conv2d... (gradient)\n model_analyzer_testlib.py:49:BuildSmallModel:x = nn_ops.conv2d... (gradient)\n model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (0/1.04k para\n model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (gradient) (0\n model_analyzer_testlib.py:64:BuildFullModel:target = array_op... (0/0 params, \n model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (0/0 params, \n model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (gradient) (0\n model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min... (0/0 params, \n', result) self.assertLess(0, tfprof_node.total_exec_micros) self.assertEqual(2844, tfprof_node.total_parameters) self.assertEqual(91040, tfprof_node.total_float_ops) - self.assertEqual(5, len(tfprof_node.children)) + self.assertEqual(8, len(tfprof_node.children)) self.assertEqual('_TFProfRoot', tfprof_node.name) self.assertEqual( 'model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_...', tfprof_node.children[0].name) self.assertEqual( - 'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c...', + 'model_analyzer_testlib.py:58:BuildFullModel:seq.append(array_... (gradient)', tfprof_node.children[1].name) self.assertEqual( - 'model_analyzer_testlib.py:64:BuildFullModel:target = array_op...', + 'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c...', tfprof_node.children[2].name) self.assertEqual( - 'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_...', + 'model_analyzer_testlib.py:62:BuildFullModel:cell, array_ops.c... (gradient)', tfprof_node.children[3].name) self.assertEqual( - 'model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min...', + 'model_analyzer_testlib.py:64:BuildFullModel:target = array_op...', tfprof_node.children[4].name) + self.assertEqual( + 'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_...', + tfprof_node.children[5].name) + self.assertEqual( + 'model_analyzer_testlib.py:65:BuildFullModel:loss = nn_ops.l2_... (gradient)', + tfprof_node.children[6].name) + self.assertEqual( + 'model_analyzer_testlib.py:67:BuildFullModel:return sgd_op.min...', + tfprof_node.children[7].name) # pylint: enable=line-too-long def testCodeViewLeafGraphNode(self): diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py index 502fc49bb6..641895ffe5 100644 --- a/tensorflow/python/profiler/option_builder.py +++ b/tensorflow/python/profiler/option_builder.py @@ -406,7 +406,7 @@ class ProfileOptionBuilder(object): """Generate a pprof profile gzip file. To use the pprof file: - pprof -png --nodecount=20 --sample_index=1 <pprof_file> + pprof -png --nodecount=100 --sample_index=1 <pprof_file> Args: pprof_file: filename for output, usually suffixed with .pb.gz. diff --git a/tensorflow/python/profiler/profiler_test.py b/tensorflow/python/profiler/profiler_test.py index 2170e1bdea..46afe1fe55 100644 --- a/tensorflow/python/profiler/profiler_test.py +++ b/tensorflow/python/profiler/profiler_test.py @@ -183,6 +183,51 @@ class ProfilerTest(test.TestCase): checker = advice_pb.checkers['ExpensiveOperationChecker'] self.assertGreater(len(checker.reports), 0) + def testMultipleProfilePerStep(self): + ops.reset_default_graph() + opts = (builder(builder.trainable_variables_parameter()) + .with_empty_output() + .with_accounted_types(['.*']) + .select(['micros', 'bytes', 'peak_bytes', + 'residual_bytes', 'output_bytes']).build()) + + r = lib.BuildSmallModel() + sess = session.Session() + profiler = model_analyzer.Profiler(sess.graph) + + init_var_run_meta = config_pb2.RunMetadata() + sess.run(variables.global_variables_initializer(), + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE), + run_metadata=init_var_run_meta) + + train_run_meta = config_pb2.RunMetadata() + sess.run(r, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE), + run_metadata=train_run_meta) + + profiler.add_step(0, train_run_meta) + ret1 = profiler.profile_name_scope(opts) + n1 = lib.SearchTFProfNode( + ret1, 'DW/Initializer/random_normal/RandomStandardNormal') + # Without the var initialization run_meta, it doesn't have the + # information of var_initialization. + self.assertEqual(n1.exec_micros, 0) + self.assertEqual(n1.requested_bytes, 0) + self.assertEqual(n1.peak_bytes, 0) + self.assertEqual(n1.residual_bytes, 0) + + profiler.add_step(0, init_var_run_meta) + ret2 = profiler.profile_name_scope(opts) + n2 = lib.SearchTFProfNode( + ret2, 'DW/Initializer/random_normal/RandomStandardNormal') + # After adding the var initialization run_meta. + self.assertGreater(n2.exec_micros, 0) + self.assertGreater(n2.requested_bytes, 0) + self.assertGreater(n2.peak_bytes, 0) + self.assertGreater(n2.residual_bytes, 0) + if __name__ == '__main__': test.main() |