diff options
author | 2017-09-06 21:56:33 -0700 | |
---|---|---|
committer | 2017-09-06 22:01:54 -0700 | |
commit | 9e9ffa33d154b6c332bd475d6da2030746460fed (patch) | |
tree | 64de1c00853926d394f6f4ba82f01f448e990fda /tensorflow/core/profiler | |
parent | 27ce2a4ac956941ba8a0b9aaaa77acc0aa861fef (diff) |
Unify all profile files (graph,run_meta,op_log) into one.
Also allow profiler to serialize/deserialize to/from file.
PiperOrigin-RevId: 167815923
Diffstat (limited to 'tensorflow/core/profiler')
-rw-r--r-- | tensorflow/core/profiler/README.md | 92 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/advisor/operation_checker.h | 7 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc | 2 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/print_model_analysis.cc | 40 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/print_model_analysis.h | 9 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_graph.cc | 10 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node.cc | 68 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node.h | 313 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_scope.cc | 4 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_show_test.cc | 23 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_stats.cc | 78 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_stats.h | 12 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_stats_test.cc | 18 | ||||
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_utils.cc | 1 | ||||
-rw-r--r-- | tensorflow/core/profiler/profiler.cc | 103 | ||||
-rw-r--r-- | tensorflow/core/profiler/tfprof_log.proto | 93 |
16 files changed, 631 insertions, 242 deletions
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md index f0d4dafd3e..2ddac0a79b 100644 --- a/tensorflow/core/profiler/README.md +++ b/tensorflow/core/profiler/README.md @@ -21,60 +21,70 @@ ### Quick Start ```python -# Enable tracing and profile the time and memory information. +# When using high-level API, session is usually hidden. +# +# Under the default ProfileContext, run a few hundred steps. +# The ProfileContext will sample some steps and dump the profiles +# to files. Users can then use command line tool or Web UI for +# interactive profiling. +with tf.contrib.tfprof.ProfileContext('/tmp/train_dir') as pctx: + # High level API, such as slim, Estimator, etc. + train_loop() + +bazel-bin/tensorflow/core/profiler/profiler \ + --profile_path=/tmp/train_dir/profile_xx +tfprof> op -select micros,bytes,occurrence -order_by micros + +# To be open sourced... +bazel-bin/third_party/tensorflow/python/profiler/profiler_ui \ + --profile_path=/tmp/profiles/profile_1 +``` +![ProfilerUI](g3doc/profiler_ui.jpg) + +```python +# When using lower-level APIs with a Session object. User can have +# explicit control of each step. +# +# Create options to profile the time and memory information. builder = tf.profiler.ProfileOptionBuilder opts = builder(builder.time_and_memory()).order_by('micros').build() -with tf.contrib.tfprof.ProfileContext() as pctx: +# Create a profiling context, set contructor argument `trace_steps`, +# `dump_steps` to empty for explicit control. +with tf.contrib.tfprof.ProfileContext('/tmp/train_dir', + trace_steps=[], + dump_steps=[]) as pctx: with tf.Session() as sess: - x = build_model() - sess.run(tf.global_variables_initializer()) - for i in range(10000): - if i == 10: - pctx.capture_next_run_meta() - _ = sess.run(x) - tf.profiler.profile(sess.graph, run_meta=pctx.run_meta(), cmd='op', options=opts) - else: - _ = sess.run(x) + # Enable tracing for next session.run. + pctx.trace_next_step() + # Dump the profile to '/tmp/train_dir' after the step. + pctx.dump_next_step() + _ = session.run(train_op) + pctx.profiler.profile_operations(options=opts) ``` ```python -# When using high-level API, session is usually hidden. -# Register profiling to ProfileContext that performs profiling automatically. +# For more advanced usage, user can control the tracing steps and +# dumping steps. User can also run online profiling during training. +# +# Create options to profile time/memory as well as parameters. builder = tf.profiler.ProfileOptionBuilder opts = builder(builder.time_and_memory()).order_by('micros').build() opts2 = tf.profiler.ProfileOptionBuilder.trainable_variables_parameter() -with tf.contrib.tfprof.ProfileContext() as pctx: - # Profile time and memory from step 100 to 109. - # Note: High level APIs can run sessions for queue, variable initialization - # or training at different steps. Try profile multiple steps. - pctx.add_auto_profiling('op', opts, range(100, 110)) - # Profile model parameters once at step 10 (since it is static). - pctx.add_auto_profiling('scope', opts2, [10]) + +# Collect traces of steps 10~20, dump the whole profile (with traces of +# step 10~20) at step 20. The dumped profile can be used for further profiling +# with command line interface or Web UI. +with tf.contrib.tfprof.ProfileContext('/tmp/train_dir', + trace_steps=range(10, 20), + dump_steps=[20]) as pctx: + # Run online profiling with 'op' view and 'opts' options at step 15, 18, 20. + pctx.add_auto_profiling('op', opts, [15, 18, 20]) + # Run online profiling with 'scope' view and 'opts2' options at step 20. + pctx.add_auto_profiling('scope', opts2, [20]) # High level API, such as slim, Estimator, etc. train_loop() ``` -```shell -# Profiling from Python API is not interactive. -# Dump the profiles to files and profile with interactive command line or web UI. -with tf.contrib.tfprof.ProfileContext() as pctx: - pctx.add_auto_profile_dump('/tmp/profiles', [100]) - train_loop() - -bazel-bin/tensorflow/core/profiler/profiler \ - --graph_path=/tmp/profiles/graph.pbtxt \ - --run_meta_path=/tmp/profiles/run_meta \ - --op_log_path=/tmp/profiles/tfprof_log \ -tfprof> op -select micros,bytes,occurrence -order_by micros - - -# To be open sourced... -bazel-bin/third_party/tensorflow/python/profiler/profiler_ui \ - --graph_path=/tmp/profiles/graph.pbtxt \ - --run_meta_path=/tmp/profiles/run_meta \ - --op_log_path=/tmp/profiles/tfprof_log \ -``` -![ProfilerUI](g3doc/profiler_ui.jpg) <b>Detail Tutorials</b> diff --git a/tensorflow/core/profiler/internal/advisor/operation_checker.h b/tensorflow/core/profiler/internal/advisor/operation_checker.h index 08f6d3c389..f0bd72fa40 100644 --- a/tensorflow/core/profiler/internal/advisor/operation_checker.h +++ b/tensorflow/core/profiler/internal/advisor/operation_checker.h @@ -45,9 +45,10 @@ class OperationChecker : public Checker { if (node->op_types().find("FusedBatchNorm") != node->op_types().end()) { use_fused_batch_norm = true; } - if (node->op_attrs().find("data_format") != node->op_attrs().end()) { - const AttrValue* attr_val = node->op_attrs().at("data_format"); - if (attr_val->s() == "NHWC" && + + const AttrValue* attr = node->op_attrs("data_format"); + if (attr) { + if (attr->s() == "NHWC" && IsPlacedOnAccelerator(node->canonical_device())) { recommend_nchw = true; } diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc index 23ed287f7b..c39d44b7fa 100644 --- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc +++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc @@ -48,7 +48,7 @@ class TFProfAdvisorTest : public ::testing::Test { for (const auto& attr : attrs) { (*def->mutable_attr())[attr.first].set_s(attr.second); } - std::unique_ptr<TFGraphNode> node(new TFGraphNode(def)); + std::unique_ptr<TFGraphNode> node(new TFGraphNode(def, -1)); NodeExecStats node_stat; node_stat.set_all_start_micros(start_miros); diff --git a/tensorflow/core/profiler/internal/print_model_analysis.cc b/tensorflow/core/profiler/internal/print_model_analysis.cc index fd46b957e8..ddf3c7f1f2 100644 --- a/tensorflow/core/profiler/internal/print_model_analysis.cc +++ b/tensorflow/core/profiler/internal/print_model_analysis.cc @@ -90,38 +90,61 @@ bool NewProfiler(const string* graph, const string* op_log) { if (!graph_ptr->ParseFromString(*graph)) { if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) { fprintf(stderr, "Failed to parse graph\n"); + return false; } } std::unique_ptr<OpLogProto> op_log_ptr; if (op_log && !op_log->empty()) { op_log_ptr.reset(new OpLogProto()); - op_log_ptr->ParseFromString(*op_log); + if (!op_log_ptr->ParseFromString(*op_log)) { + fprintf(stderr, "Failed to parse OpLogProto.\n"); + return false; + } } tf_stat = new TFStats(std::move(graph_ptr), nullptr, std::move(op_log_ptr), nullptr); return true; } +void ProfilerFromFile(const string* filename) { + CHECK(!tf_stat) << "Currently only 1 living tfprof profiler is allowed"; + CHECK(filename) << "Missing profile filename to init profiler from file"; + tf_stat = new TFStats(*filename, nullptr); +} + void DeleteProfiler() { - delete tf_stat; - tf_stat = nullptr; + if (tf_stat) { + delete tf_stat; + tf_stat = nullptr; + } } -void AddStep(int64 step, const string* run_meta, const string* op_log) { +void AddStep(int64 step, const string* graph, const string* run_meta, + const string* op_log) { CHECK(tf_stat); + + CHECK(graph && !graph->empty()); + std::unique_ptr<GraphDef> graph_ptr(new GraphDef()); + if (!graph_ptr->ParseFromString(*graph)) { + if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) { + fprintf(stderr, "Failed to parse graph\n"); + } + } + tf_stat->AddGraph(std::move(graph_ptr)); + CHECK(run_meta && !run_meta->empty()); // TODO(xpan): Better error handling. std::unique_ptr<RunMetadata> run_meta_ptr(new RunMetadata()); run_meta_ptr->ParseFromString(*run_meta); tf_stat->AddRunMeta(step, std::move(run_meta_ptr)); - std::unique_ptr<OpLogProto> op_log_ptr; if (op_log && !op_log->empty()) { + std::unique_ptr<OpLogProto> op_log_ptr; op_log_ptr.reset(new OpLogProto()); op_log_ptr->ParseFromString(*op_log); + tf_stat->AddOpLogProto(std::move(op_log_ptr)); } - tf_stat->AddOpLogProto(std::move(op_log_ptr)); } string Profile(const string* command, const string* options) { @@ -131,6 +154,11 @@ string Profile(const string* command, const string* options) { return RunProfile(*command, *options, tf_stat); } +void WriteProfile(const string* filename) { + CHECK(filename) << "empty file name when asking to write profile."; + tf_stat->WriteProfile(*filename); +} + string PrintModelAnalysis(const string* graph, const string* run_meta, const string* op_log, const string* command, const string* options) { diff --git a/tensorflow/core/profiler/internal/print_model_analysis.h b/tensorflow/core/profiler/internal/print_model_analysis.h index 76a436f71c..e4d01041a8 100644 --- a/tensorflow/core/profiler/internal/print_model_analysis.h +++ b/tensorflow/core/profiler/internal/print_model_analysis.h @@ -35,7 +35,14 @@ bool NewProfiler(const string* graph, const string* op_log); void DeleteProfiler(); -void AddStep(int64 step, const string* run_meta, const string* op_log); +void AddStep(int64 step, const string* graph, const string* run_meta, + const string* op_log); + +// Write the profiler's profile to a proto buffer. +void WriteProfile(const string* filename); + +// Load the profile to profiler from a proto buffer file. +void ProfilerFromFile(const string* filename); string Profile(const string* command, const string* options); diff --git a/tensorflow/core/profiler/internal/tfprof_graph.cc b/tensorflow/core/profiler/internal/tfprof_graph.cc index e58924a614..3766365bf8 100644 --- a/tensorflow/core/profiler/internal/tfprof_graph.cc +++ b/tensorflow/core/profiler/internal/tfprof_graph.cc @@ -30,8 +30,8 @@ GraphNode* TFGraph::CreateParentNode(const string& name) { node_defs_.push_back(std::unique_ptr<NodeDef>(new NodeDef())); node_defs_.back()->set_name(name); node_defs_.back()->set_op(kTFGraphParent); - parent_nodes_[name] = - std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get())); + parent_nodes_[name] = std::unique_ptr<TFGraphNode>( + new TFGraphNode(node_defs_.back().get(), -1)); nodes_map_[name] = std::unique_ptr<GraphNode>(new GraphNode(parent_nodes_[name].get())); return nodes_map_[name].get(); @@ -49,11 +49,11 @@ void TFGraph::Build() { // Filter out the root nodes (node not input of any other node). for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) { GraphNode* node = it->second.get(); - const std::map<int, TFGraphNode*>& inputs = node->node->inputs(); + const std::map<int, string>& inputs = node->node->inputs(); for (auto inputs_it = inputs.cbegin(); inputs_it != inputs.cend(); inputs_it++) { - nonroots.insert(inputs_it->second->name()); - auto child_it = nodes_map_.find(inputs_it->second->name()); + nonroots.insert(inputs_it->second); + auto child_it = nodes_map_.find(inputs_it->second); if (child_it != nodes_map_.end()) { node->children.push_back(child_it->second.get()); } diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc index 19257968e6..e2be2cf4cf 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.cc +++ b/tensorflow/core/profiler/internal/tfprof_node.cc @@ -51,19 +51,21 @@ bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); } void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) { devices_.insert(dev); if (step_stat.all_start_micros() > 0) { - if (all_start_micros_ > 0) { - all_start_micros_ = std::min( - all_start_micros_, static_cast<int64>(step_stat.all_start_micros())); + if (exec_.all_start_micros() > 0) { + exec_.set_all_start_micros( + std::min(static_cast<int64>(exec_.all_start_micros()), + static_cast<int64>(step_stat.all_start_micros()))); } else { - all_start_micros_ = step_stat.all_start_micros(); + exec_.set_all_start_micros(step_stat.all_start_micros()); } int64 op_end_rel_micros = step_stat.op_end_rel_micros(); // Round quick execution to 1 micro to be semantically robust. if (op_end_rel_micros == 0) { ++op_end_rel_micros; } - latest_end_micros_ = std::max( - latest_end_micros_, step_stat.all_start_micros() + op_end_rel_micros); + exec_.set_latest_end_micros( + std::max(static_cast<int64>(exec_.latest_end_micros()), + step_stat.all_start_micros() + op_end_rel_micros)); const std::pair<int64, int64> pair = std::make_pair(step_stat.all_start_micros(), op_end_rel_micros); @@ -75,17 +77,17 @@ void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) { op_execs_[dev].push_back(pair); // In while-loop, a graph node is executed multiple times under // the same name. - run_count_ += 1; + exec_.set_run_count(exec_.run_count() + 1); } } } void ExecStep::AddMemoryStats(const string& dev, const NodeExecStats& step_stat) { - if (mem_initiated_) { + if (exec_.memory_intialized()) { return; } - mem_initiated_ = true; + exec_.set_memory_intialized(true); for (const auto& mem : step_stat.memory()) { // TODO(xpan): Fix this hack. Currently the allocator name seems quite @@ -93,9 +95,9 @@ void ExecStep::AddMemoryStats(const string& dev, if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) { continue; } - allocator_bytes_in_use_ = - std::max(allocator_bytes_in_use_, - static_cast<int64>(mem.allocator_bytes_in_use())); + exec_.set_allocator_bytes_in_use( + std::max(static_cast<int64>(exec_.allocator_bytes_in_use()), + static_cast<int64>(mem.allocator_bytes_in_use()))); } int64 total_output_bytes = 0; for (const auto& output : step_stat.output()) { @@ -114,16 +116,20 @@ void ExecStep::AddMemoryStats(const string& dev, output_memory_[output.slot()] = std::make_pair(output_bytes, output_ptr); } } - output_bytes_ = total_output_bytes; + exec_.set_output_bytes(total_output_bytes); if (step_stat.has_memory_stats()) { - host_temp_bytes_ += step_stat.memory_stats().host_temp_memory_size(); - host_persistent_bytes_ += - step_stat.memory_stats().host_persistent_memory_size(); - accelerator_temp_bytes_ += - step_stat.memory_stats().device_temp_memory_size(); - accelerator_persistent_bytes_ += - step_stat.memory_stats().device_persistent_memory_size(); + exec_.set_host_temp_bytes(exec_.host_temp_bytes() + + step_stat.memory_stats().host_temp_memory_size()); + exec_.set_host_persistent_bytes( + exec_.host_persistent_bytes() + + step_stat.memory_stats().host_persistent_memory_size()); + exec_.set_accelerator_temp_bytes( + exec_.accelerator_temp_bytes() + + step_stat.memory_stats().device_temp_memory_size()); + exec_.set_accelerator_persistent_bytes( + exec_.accelerator_persistent_bytes() + + step_stat.memory_stats().device_persistent_memory_size()); } int64 residual_bytes = 0; int64 requested_bytes = 0; @@ -133,9 +139,9 @@ void ExecStep::AddMemoryStats(const string& dev, requested_bytes += mem.total_bytes(); peak_bytes += mem.peak_bytes(); } - requested_bytes_ = requested_bytes; - residual_bytes_ = residual_bytes; - peak_bytes_ = peak_bytes; + exec_.set_requested_bytes(requested_bytes); + exec_.set_residual_bytes(residual_bytes); + exec_.set_peak_bytes(peak_bytes); } void TFGraphNode::AddStepStat(int64 step, const string& device, @@ -147,33 +153,33 @@ void TFGraphNode::AddStepStat(int64 step, const string& device, // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0. // It can has some ad-hoc suffix, such as /stream:xx or /memcpy:xx. if (IsCanonicalDevice(dev)) { - if (!canonical_device_.empty()) { - if (canonical_device_ != dev) { + if (!node_.canonical_device().empty()) { + if (node_.canonical_device() != dev) { // TODO(xpan): Some RunMetadata node appears at multiple devices. // Need to address it. return; } } else { - canonical_device_ = dev; + node_.set_canonical_device(dev); // TODO(xpan): Support things other than gpu? if (dev.find("sycl") != dev.npos) { - host_device_ = StringReplace(dev, "device:sycl:\\d+", "cpu:0"); + node_.set_host_device(StringReplace(dev, "device:sycl:\\d+", "cpu:0")); } else { - host_device_ = StringReplace(dev, "gpu:\\d+", "cpu:0"); + node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0")); } - AddOpType(canonical_device_); + AddOpType(node_.canonical_device()); } } auto exec = execs_.find(step); if (exec == execs_.end()) { - execs_.insert(std::pair<int64, ExecStep>(step, ExecStep(this))); + execs_.insert(std::pair<int64, ExecStep>(step, ExecStep())); exec = execs_.find(step); } exec->second.AddTimeStats(dev, step_stat); - if (dev == canonical_device_) { + if (dev == node_.canonical_device()) { exec->second.AddMemoryStats(dev, step_stat); } } diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h index 5ec3da12cf..55d53f3923 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.h +++ b/tensorflow/core/profiler/internal/tfprof_node.h @@ -44,27 +44,13 @@ class TFGraphNode; class ExecStep { public: - ExecStep(TFGraphNode* node) - : node(node), - run_count_(0), - all_start_micros_(0), - latest_end_micros_(0), - mem_initiated_(false), - requested_bytes_(0), - peak_bytes_(0), - residual_bytes_(0), - output_bytes_(0), - host_temp_bytes_(0), - host_persistent_bytes_(0), - accelerator_temp_bytes_(0), - accelerator_persistent_bytes_(0), - allocator_bytes_in_use_(0) {} + ExecStep() {} void AddTimeStats(const string& dev, const NodeExecStats& step_stat); void AddMemoryStats(const string& dev, const NodeExecStats& step_stat); - int64 run_count() const { return run_count_; } + int64 run_count() const { return exec_.run_count(); } // The execution time of an op. If it runs on accelerator, then it's // accelerator_exec_micros(). Otherwise, it's CPU time. int64 exec_micros() const; @@ -77,31 +63,101 @@ class ExecStep { const { return op_execs_; } - int64 all_start_micros() const { return all_start_micros_; } - int64 latest_end_micros() const { return latest_end_micros_; } - - int64 requested_bytes() const { return requested_bytes_; } - int64 peak_bytes() const { return peak_bytes_; } - int64 residual_bytes() const { return residual_bytes_; } - int64 output_bytes() const { return output_bytes_; } - int64 accelerator_temp_bytes() const { return accelerator_temp_bytes_; } - int64 host_temp_bytes() const { return host_temp_bytes_; } + int64 all_start_micros() const { return exec_.all_start_micros(); } + int64 latest_end_micros() const { return exec_.latest_end_micros(); } + + int64 requested_bytes() const { return exec_.requested_bytes(); } + int64 peak_bytes() const { return exec_.peak_bytes(); } + int64 residual_bytes() const { return exec_.residual_bytes(); } + int64 output_bytes() const { return exec_.output_bytes(); } + int64 accelerator_temp_bytes() const { + return exec_.accelerator_temp_bytes(); + } + int64 host_temp_bytes() const { return exec_.host_temp_bytes(); } int64 accelerator_persistent_bytes() const { - return accelerator_persistent_bytes_; + return exec_.accelerator_persistent_bytes(); } - int64 host_persistent_bytes() const { return host_persistent_bytes_; } - const std::map<int64, std::pair<int64, uint64>>& output_memory() const { + int64 host_persistent_bytes() const { return exec_.host_persistent_bytes(); } + const std::map<int32, std::pair<int64, uint64>>& output_memory() const { return output_memory_; } - int64 allocator_bytes_in_use() const { return allocator_bytes_in_use_; } + int64 allocator_bytes_in_use() const { + return exec_.allocator_bytes_in_use(); + } + + const ExecProfile& ToProto() { + exec_.mutable_accelerator_execs()->clear(); + for (const auto& e : accelerator_execs_) { + auto& exec_time = (*exec_.mutable_accelerator_execs())[e.first]; + for (const auto& p : e.second) { + auto* t = exec_time.mutable_times()->Add(); + t->add_int64_values(p.first); + t->add_int64_values(p.second); + } + } + + exec_.mutable_cpu_execs()->clear(); + for (const auto& e : cpu_execs_) { + auto& exec_time = (*exec_.mutable_cpu_execs())[e.first]; + for (const auto& p : e.second) { + auto* t = exec_time.mutable_times()->Add(); + t->add_int64_values(p.first); + t->add_int64_values(p.second); + } + } + + exec_.mutable_devices()->Clear(); + exec_.mutable_devices()->Reserve(devices_.size()); + for (const string& d : devices_) { + exec_.add_devices(d); + } + + exec_.mutable_output_memory()->clear(); + for (const auto& mem : output_memory_) { + auto& mem_pb = (*exec_.mutable_output_memory())[mem.first]; + mem_pb.set_bytes(mem.second.first); + mem_pb.set_ptr(mem.second.second); + } + + return exec_; + } + + void FromProto(const ExecProfile& exec) { + exec_.Clear(); + exec_.MergeFrom(exec); + + devices_.clear(); + devices_.insert(exec.devices().begin(), exec.devices().end()); + + accelerator_execs_.clear(); + cpu_execs_.clear(); + op_execs_.clear(); + + for (const auto& exec_time : exec_.accelerator_execs()) { + auto& exec = accelerator_execs_[exec_time.first]; + auto& op_exec = op_execs_[exec_time.first]; + for (const auto& p : exec_time.second.times()) { + exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1))); + op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1))); + } + } + for (const auto& exec_time : exec_.cpu_execs()) { + auto& exec = cpu_execs_[exec_time.first]; + auto& op_exec = op_execs_[exec_time.first]; + for (const auto& p : exec_time.second.times()) { + exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1))); + op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1))); + } + } + for (const auto& output_mem : exec_.output_memory()) { + auto& mem = output_memory_[output_mem.first]; + mem.first = output_mem.second.bytes(); + mem.second = output_mem.second.ptr(); + } + } private: - TFGraphNode* node; - // Can be larger than 1 if run multiple times in loop. - int64 run_count_; - // The earliest/latest time including scheduling and execution. - int64 all_start_micros_; - int64 latest_end_micros_; + ExecProfile exec_; // device -> vector of {op_start_micros, op_exec_micros} pairs. // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros} // For accelerator, vector size can be larger than 1, multiple kernel fires @@ -115,26 +171,8 @@ class ExecStep { // All devices the op is associated with (e.g. gpu:0 (scheduling), // gpu:0:stream:xx (kernel exec), cpu:0 host) std::set<string> devices_; - - bool mem_initiated_; - // Total bytes requested by the op. - int64 requested_bytes_; - // Total bytes requested by the op and released before op end. - int64 peak_bytes_; - // Total bytes requested by the op and not released after op end. - int64 residual_bytes_; - // Total bytes output by the op (not necessarily requested by the op). - int64 output_bytes_; - // Total temporary bytes allocated and released by the op. - int64 host_temp_bytes_; - // Total persistent bytes (e.g. variable) allocated by the op. - int64 host_persistent_bytes_; - int64 accelerator_temp_bytes_; - int64 accelerator_persistent_bytes_; - // The total number of bytes currently allocated by the allocator if >0. - int64 allocator_bytes_in_use_; // output_idx -> {output_bytes, memory_ptr} - std::map<int64, std::pair<int64, uint64>> output_memory_; + std::map<int32, std::pair<int64, uint64>> output_memory_; }; #define GRAPH_NODE_BYTES(type) \ @@ -157,10 +195,18 @@ class ExecStep { class TFGraphNode { public: - TFGraphNode(const NodeDef* node) - : node_(node), float_ops_(0), op_(node->op()) { + TFGraphNode(const ProfileNode& node, const ProfileProto& profile) { + FromProto(node, profile); + } + + TFGraphNode(const NodeDef* node, int64 id) { + node_.set_id(id); + node_.set_name(node->name()); + node_.set_op(node->op()); + node_.set_float_ops(0); + for (const auto& attr : node->attr()) { - op_attrs_[attr.first] = &attr.second; + (*node_.mutable_attrs())[attr.first].MergeFrom(attr.second); if (attr.first == "shape" && attr.second.has_shape()) { if (!shape_.empty()) { fprintf(stderr, "Found duplicated shapes!\n"); @@ -180,10 +226,10 @@ class TFGraphNode { op_types_.insert(node->op()); } - void AddInput(TFGraphNode* input, int64 output_idx, int input_idx) { + void AddInput(TFGraphNode* input, int32 output_idx, int input_idx) { src_output_idx_[input->name()] = output_idx; - inputs_[input_idx] = input; + inputs_[input_idx] = input->name(); const auto& output_shape = input->output_shapes().find(output_idx); // Always create an empty vec even if the shape info might be missing. std::vector<int64>& shape_vec = input_shapes_[input_idx]; @@ -198,28 +244,123 @@ class TFGraphNode { void AddStepStat(int64 step, const string& device, const NodeExecStats& step_stat); - void AddFloatOps(int64 float_ops) { float_ops_ = float_ops; } + void AddFloatOps(int64 float_ops) { node_.set_float_ops(float_ops); } // TODO(xpan): This could take a lot of memory. - void AddCode(const CodeDef& code) { code_.MergeFrom(code); } + void AddCode(const CodeDef& code) { node_.mutable_trace()->MergeFrom(code); } - const string& name() const { return node_->name(); } - const string& op() const { return op_; } - const NodeDef* node_def() { return node_; } + const string& name() const { return node_.name(); } + int64 id() const { return node_.id(); } + const string& op() const { return node_.op(); } + const ProfileNode& node() { return node_; } bool trackable(int64 step) const { auto exec = execs_.find(step); if (exec == execs_.end()) return false; if (exec->second.all_start_micros() == 0) return false; - if (canonical_device_.empty() || host_device_.empty()) { + if (node_.canonical_device().empty() || node_.host_device().empty()) { return false; } return true; } - const std::map<int, TFGraphNode*>& inputs() const { return inputs_; } - const std::map<string, int64>& src_output_idx() const { + const ProfileNode& ToProto( + const std::map<string, std::unique_ptr<TFGraphNode>>& nodes_map) { + node_.clear_shape(); + node_.mutable_shape()->Reserve(shape().size()); + for (int64 s : shape()) { + node_.add_shape(s); + } + + node_.clear_op_types(); + node_.mutable_op_types()->Reserve(op_types().size()); + for (const string& t : op_types()) { + node_.add_op_types(t); + } + + node_.clear_execs(); + for (auto& exec : execs_) { + auto& exec_pb = (*node_.mutable_execs())[exec.first]; + exec_pb.MergeFrom(exec.second.ToProto()); + } + + node_.clear_inputs(); + for (const auto& inp : inputs_) { + (*node_.mutable_inputs())[inp.first] = nodes_map.at(inp.second)->id(); + } + + node_.clear_input_shapes(); + for (const auto& s : input_shapes_) { + auto& shape = (*node_.mutable_input_shapes())[s.first]; + for (int64 d : s.second) { + shape.add_int64_values(d); + } + } + + node_.clear_output_shapes(); + for (const auto& s : output_shapes_) { + auto& shape = (*node_.mutable_output_shapes())[s.first]; + for (int64 d : s.second) { + shape.add_int64_values(d); + } + } + + node_.clear_src_output_index(); + for (const auto& s : src_output_idx_) { + int64 id = nodes_map.at(s.first)->id(); + (*node_.mutable_src_output_index())[id] = s.second; + } + return node_; + } + + void FromProto(const ProfileNode& node, const ProfileProto& profile) { + node_.Clear(); + node_.MergeFrom(node); + + op_types_.clear(); + op_types_.insert(node_.op_types().begin(), node_.op_types().end()); + + shape_.clear(); + for (int64 s : node_.shape()) { + shape_.push_back(s); + } + + execs_.clear(); + for (const auto& exec_pb : node.execs()) { + auto& exec = execs_[exec_pb.first]; + exec.FromProto(exec_pb.second); + } + + inputs_.clear(); + for (const auto& inp : node.inputs()) { + inputs_[inp.first] = profile.nodes().at(inp.second).name(); + } + + input_shapes_.clear(); + for (const auto& s : node.input_shapes()) { + auto& shape = input_shapes_[s.first]; + for (const int64 d : s.second.int64_values()) { + shape.push_back(d); + } + } + + output_shapes_.clear(); + for (const auto& s : node.output_shapes()) { + auto& shape = output_shapes_[s.first]; + for (const int64 d : s.second.int64_values()) { + shape.push_back(d); + } + } + + src_output_idx_.clear(); + for (const auto& s : node.src_output_index()) { + src_output_idx_[profile.nodes().at(s.first).name()] = s.second; + } + } + + const std::map<int32, string>& inputs() const { return inputs_; } + const std::map<string, int32>& src_output_idx() const { return src_output_idx_; } @@ -346,7 +487,7 @@ class TFGraphNode { CHECK(exec != execs_.end()) << "unknown step " << step; return exec->second.host_persistent_bytes(); } - const std::map<int64, std::pair<int64, uint64>>& output_memory( + const std::map<int32, std::pair<int64, uint64>>& output_memory( int64 step) const { auto exec = execs_.find(step); CHECK(exec != execs_.end()) << "unknown step " << step; @@ -382,18 +523,22 @@ class TFGraphNode { int64 float_ops(int64 step) const { // If not run, return static analysis. if (execs_.empty()) { - return float_ops_; + return node_.float_ops(); } // Otherwise, return dynamic float_ops. - return float_ops_ * run_count(step); + return node_.float_ops() * run_count(step); } - const CodeDef& code() { return code_; } - string canonical_device() const { return canonical_device_; } - string host_device() const { return host_device_; } + const CodeDef& code() { return node_.trace(); } + string canonical_device() const { return node_.canonical_device(); } + string host_device() const { return node_.host_device(); } const std::set<string>& op_types() const { return op_types_; } - const std::map<string, const AttrValue*>& op_attrs() const { - return op_attrs_; + const AttrValue* op_attrs(const string& name) const { + const auto it = node_.attrs().find(name); + if (it == node_.attrs().end()) { + return nullptr; + } + return &it->second; } const std::vector<int64>& shape() const { return shape_; } @@ -406,12 +551,10 @@ class TFGraphNode { } private: - std::map<int, TFGraphNode*> inputs_; - std::map<string, int64> src_output_idx_; + std::map<int, string> inputs_; + std::map<string, int32> src_output_idx_; - const NodeDef* node_; - - CodeDef code_; + ProfileNode node_; std::vector<int64> shape_; // Won't missing input_idx. But some shapes might be empty (unknown). @@ -421,18 +564,8 @@ class TFGraphNode { std::map<int, std::vector<int64>> output_shapes_; std::set<string> op_types_; - std::map<string, const AttrValue*> op_attrs_; std::map<int64, ExecStep> execs_; - - // /j:#/t:#/r:#/device:#. A canonical device name without extra suffix. - string canonical_device_; - // The host device name. - string host_device_; - - int64 float_ops_; - - string op_; }; class TFMultiGraphNode { diff --git a/tensorflow/core/profiler/internal/tfprof_scope.cc b/tensorflow/core/profiler/internal/tfprof_scope.cc index d7293c5e52..128b296d5c 100644 --- a/tensorflow/core/profiler/internal/tfprof_scope.cc +++ b/tensorflow/core/profiler/internal/tfprof_scope.cc @@ -34,8 +34,8 @@ ScopeNode* TFScope::CreateParentNode(const string& name) { node_defs_.push_back(std::unique_ptr<NodeDef>(new NodeDef())); node_defs_.back()->set_name(name); node_defs_.back()->set_op(kTFScopeParent); - parent_nodes_[name] = - std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get())); + parent_nodes_[name] = std::unique_ptr<TFGraphNode>( + new TFGraphNode(node_defs_.back().get(), -1)); nodes_map_[name] = std::unique_ptr<ScopeNode>(new ScopeNode(parent_nodes_[name].get())); return nodes_map_[name].get(); diff --git a/tensorflow/core/profiler/internal/tfprof_show_test.cc b/tensorflow/core/profiler/internal/tfprof_show_test.cc index 24cf9afb17..1f19f8c322 100644 --- a/tensorflow/core/profiler/internal/tfprof_show_test.cc +++ b/tensorflow/core/profiler/internal/tfprof_show_test.cc @@ -68,6 +68,23 @@ class TFProfShowTest : public ::testing::Test { tf_stats_->BuildAllViews(); } + string TestToFromProto(const string& cmd, const Options& opts, + bool show_multi_node = false) { + string profile_file = io::JoinPath(testing::TmpDir(), "profile"); + tf_stats_->WriteProfile(profile_file); + TFStats new_stats(profile_file, nullptr); + new_stats.BuildAllViews(); + if (show_multi_node) { + new_stats.ShowMultiGraphNode(cmd, opts); + } else { + new_stats.ShowGraphNode(cmd, opts); + } + string dump_str; + TF_CHECK_OK(ReadFileToString(Env::Default(), + opts.output_options.at("outfile"), &dump_str)); + return dump_str; + } + std::unique_ptr<TFStats> tf_stats_; }; @@ -95,6 +112,8 @@ TEST_F(TFProfShowTest, DumpScopeMode) { "0us/0us, 11us/11us)\n ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, " "0B/0B, 0B/0B, 0us/0us, 0us/0us, 0us/0us)\n", dump_str); + + EXPECT_EQ(dump_str, TestToFromProto("scope", opts)); } TEST_F(TFProfShowTest, DumpAcceleratorAndCPUMicros) { @@ -140,6 +159,8 @@ TEST_F(TFProfShowTest, DumpAcceleratorAndCPUMicros) { "0us/0us)\n ScalarW/read (0us/0us, 0us/0us)\n init (0us/0us, " "0us/0us)\n", dump_str); + + EXPECT_EQ(dump_str, TestToFromProto("scope", opts)); } TEST_F(TFProfShowTest, DumpOpMode) { @@ -182,6 +203,8 @@ TEST_F(TFProfShowTest, DumpOpMode) { "0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6\t(run*0|defined*1)" "\texec_time:0us\n\n", StringReplace(dump_str, " ", "")); + + EXPECT_EQ(dump_str, TestToFromProto("op", opts, true)); } } // namespace tfprof } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc index 81db766796..5b54958344 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats.cc +++ b/tensorflow/core/profiler/internal/tfprof_stats.cc @@ -46,12 +46,11 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph, std::unique_ptr<OpLogProto> op_log, std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader) : has_code_traces_(false), - graph_(std::move(graph)), ckpt_reader_(std::move(ckpt_reader)) { - CHECK(graph_) << "Must at least have GraphDef"; + CHECK(graph) << "Must at least have GraphDef"; printf("Parsing Inputs...\n"); - ParseGraph(); + AddGraph(std::move(graph)); if (run_meta && run_meta->has_step_stats()) { AddRunMeta(0, std::move(run_meta)); } @@ -67,6 +66,30 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph, } } +TFStats::TFStats(const string& filename, + std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader) + : has_code_traces_(false), ckpt_reader_(std::move(ckpt_reader)) { + string str; + Status s = ReadFileToString(Env::Default(), filename, &str); + if (!s.ok()) { + fprintf(stderr, "Failed to read profile: %s", s.ToString().c_str()); + return; + } + + ProfileProto profile; + if (!profile.ParseFromString(str)) { + fprintf(stderr, "Failed to parse profile\n"); + return; + } + + for (const auto& node_pb : profile.nodes()) { + std::unique_ptr<TFGraphNode> node(new TFGraphNode(node_pb.second, profile)); + nodes_map_.insert(std::pair<string, std::unique_ptr<TFGraphNode>>( + node_pb.second.name(), std::move(node))); + } + has_code_traces_ = profile.has_trace(); +} + void TFStats::BuildView(const string& cmd) { if (cmd == kCmds[0] && !scope_view_) { scope_view_.reset(new TFScope(ckpt_reader_.get())); @@ -135,16 +158,20 @@ const MultiGraphNodeProto& TFStats::ShowMultiGraphNode( } } -void TFStats::ParseGraph() { - for (const NodeDef& node : graph_->node()) { - CHECK(nodes_map_.find(node.name()) == nodes_map_.end()); +void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) { + std::map<string, const NodeDef*> node_defs; + for (const NodeDef& node : graph->node()) { + if (nodes_map_.find(node.name()) != nodes_map_.end()) { + continue; + } nodes_map_[node.name()] = - std::unique_ptr<TFGraphNode>(new TFGraphNode(&node)); + std::unique_ptr<TFGraphNode>(new TFGraphNode(&node, nodes_map_.size())); + node_defs[node.name()] = &node; } - for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) { - const NodeDef* node_def = it->second->node_def(); - for (int i = 0; i < node_def->input_size(); ++i) { - string node_input = node_def->input(i); + for (auto it = node_defs.begin(); it != node_defs.end(); it++) { + TFGraphNode* node = nodes_map_.at(it->first).get(); + for (int i = 0; i < it->second->input_size(); ++i) { + string node_input = it->second->input(i); int output_idx = 0; // input name format can be: "^node:src_output" auto prefix_pos = node_input.find(":"); @@ -160,10 +187,12 @@ void TFStats::ParseGraph() { node_input = node_input.substr(1); } auto input_node = nodes_map_.find(node_input); + // TODO(xpan): P1: Add the input even if it doesn't exist yet, because + // this can be a partial graph. if (input_node == nodes_map_.end()) { continue; } - it->second->AddInput(input_node->second.get(), output_idx, i); + node->AddInput(input_node->second.get(), output_idx, i); } } } @@ -210,10 +239,8 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) { if (node == nodes_map_.end()) { NodeDef def; if (CreateRunMetadataNode(name, &def)) { - NodeDef* ndef = graph_->mutable_node()->Add(); - ndef->MergeFrom(def); - nodes_map_[name] = - std::unique_ptr<TFGraphNode>(new TFGraphNode(ndef)); + nodes_map_[name] = std::unique_ptr<TFGraphNode>( + new TFGraphNode(&def, nodes_map_.size())); nodes_map_.at(name)->AddStepStat(step, dev_stat.device(), node_stat); } } else { @@ -223,6 +250,25 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) { } } +void TFStats::WriteProfile(const string& filename) { + ProfileProto profile; + for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) { + if (it->second->id() < 0) { + continue; + } + (*profile.mutable_nodes())[it->second->id()].MergeFrom( + it->second->ToProto(nodes_map_)); + if (it->second->code().traces_size() > 0) { + profile.set_has_trace(true); + } + } + Status s = + WriteStringToFile(Env::Default(), filename, profile.SerializeAsString()); + if (!s.ok()) { + fprintf(stderr, "%s\n", s.ToString().c_str()); + } +} + bool TFStats::Validate(const Options& opts) const { if (opts.step >= 0 && steps_.find(opts.step) == steps_.end()) { fprintf(stderr, "Options -step=%lld not found\n", opts.step); diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h index 8dbab16fe4..3b1251152d 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats.h +++ b/tensorflow/core/profiler/internal/tfprof_stats.h @@ -55,6 +55,10 @@ class TFStats { std::unique_ptr<RunMetadata> run_meta, std::unique_ptr<OpLogProto> op_log, std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader); + + TFStats(const string& filename, + std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader); + ~TFStats() {} const std::map<string, std::unique_ptr<TFGraphNode>>& nodes() const { @@ -76,23 +80,25 @@ class TFStats { const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd, const Options& opts) const; + // A a (partial) graph to existing graph. + void AddGraph(std::unique_ptr<GraphDef> graph); + // Add a step of run time meta data. void AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta); // Add tfprof operation meta data, such as customized op type, float_ops, // and code traces. void AddOpLogProto(std::unique_ptr<OpLogProto> op_log); + void WriteProfile(const string& filename); + // For test purpose only. void AddNodeForTest(int64 step, std::unique_ptr<TFGraphNode> node); private: bool Validate(const Options& opts) const; - void ParseGraph(); - std::set<int64> steps_; bool has_code_traces_; - std::unique_ptr<GraphDef> graph_; std::unique_ptr<TFScope> scope_view_; std::unique_ptr<TFGraph> graph_view_; std::unique_ptr<TFCode> code_view_; diff --git a/tensorflow/core/profiler/internal/tfprof_stats_test.cc b/tensorflow/core/profiler/internal/tfprof_stats_test.cc index 0bea14fe66..2f2101d76b 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats_test.cc +++ b/tensorflow/core/profiler/internal/tfprof_stats_test.cc @@ -69,6 +69,14 @@ class TFProfStatsTest : public ::testing::Test { tf_stats_->BuildAllViews(); } + string TestToFromProto(const string& cmd, const Options& opts) { + string profile_file = io::JoinPath(testing::TmpDir(), "profile"); + tf_stats_->WriteProfile(profile_file); + TFStats new_stats(profile_file, nullptr); + new_stats.BuildAllViews(); + return new_stats.ShowGraphNode(cmd, opts).DebugString(); + } + std::unique_ptr<TFStats> tf_stats_; }; @@ -98,6 +106,8 @@ TEST_F(TFProfStatsTest, CustomOpType) { "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); + + EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts)); } TEST_F(TFProfStatsTest, CheckPointOpType) { @@ -126,6 +136,8 @@ TEST_F(TFProfStatsTest, CheckPointOpType) { "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); + + EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts)); } TEST_F(TFProfStatsTest, TestGraph) { @@ -157,6 +169,8 @@ TEST_F(TFProfStatsTest, TestGraph) { "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); + + EXPECT_EQ(root.DebugString(), TestToFromProto("graph", opts)); } TEST_F(TFProfStatsTest, TestFloatOps) { @@ -203,6 +217,8 @@ TEST_F(TFProfStatsTest, TestFloatOps) { "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); + + EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts)); } TEST_F(TFProfStatsTest, TestAccountShownNameOnly) { @@ -236,6 +252,8 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) { "4096\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n", &expected)); EXPECT_EQ(expected.DebugString(), root.DebugString()); + + EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts)); } TEST_F(TFProfStatsTest, TestShowTensorValue) { diff --git a/tensorflow/core/profiler/internal/tfprof_utils.cc b/tensorflow/core/profiler/internal/tfprof_utils.cc index 383c4725b7..1ce59ad755 100644 --- a/tensorflow/core/profiler/internal/tfprof_utils.cc +++ b/tensorflow/core/profiler/internal/tfprof_utils.cc @@ -297,6 +297,7 @@ void PrintHelp() { "See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/" "g3doc/command_line.md for command line tool tutorial.\n"); printf( + "profiler --profile_path=<ProfileProto binary file> # required\nor:\n" "profiler --graph_path=<GraphDef proto file> # required\n" " --run_meta_patn=<RunMetadata proto file> # optional\n" " --run_log_path=<OpLogProto proto file> # optional\n\n"); diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc index 6acf4ea377..96e0b06bf3 100644 --- a/tensorflow/core/profiler/profiler.cc +++ b/tensorflow/core/profiler/profiler.cc @@ -66,6 +66,7 @@ void completion(const char* buf, linenoiseCompletions* lc) { } int Run(int argc, char** argv) { + string FLAGS_profile_path = ""; string FLAGS_graph_path = ""; string FLAGS_run_meta_path = ""; string FLAGS_op_log_path = ""; @@ -96,6 +97,7 @@ int Run(int argc, char** argv) { } std::vector<Flag> flag_list = { + Flag("profile_path", &FLAGS_profile_path, "Profile binary file name."), Flag("graph_path", &FLAGS_graph_path, "GraphDef proto text file name"), Flag("run_meta_path", &FLAGS_run_meta_path, "Comma-separated list of RunMetadata proto binary " @@ -138,6 +140,12 @@ int Run(int argc, char** argv) { } port::InitMain(argv[0], &argc, &argv); + if (!FLAGS_profile_path.empty() && !FLAGS_graph_path.empty()) { + fprintf(stderr, + "both --graph_path and --profile_path are set. " + "Ignore graph_path\n"); + } + std::vector<string> account_type_regexes = str_util::Split(FLAGS_account_type_regexes, ',', str_util::SkipEmpty()); std::vector<string> start_name_regexes = @@ -157,7 +165,7 @@ int Run(int argc, char** argv) { CHECK(s.ok()) << s.ToString(); string cmd = ""; - if (argc == 1 && FLAGS_graph_path.empty()) { + if (argc == 1 && FLAGS_graph_path.empty() && FLAGS_profile_path.empty()) { PrintHelp(); return 0; } else if (argc > 1) { @@ -173,24 +181,6 @@ int Run(int argc, char** argv) { } printf("Reading Files...\n"); - std::unique_ptr<GraphDef> graph(new GraphDef()); - TF_CHECK_OK( - ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false)); - - std::unique_ptr<OpLogProto> op_log(new OpLogProto()); - if (!FLAGS_op_log_path.empty()) { - string op_log_str; - s = ReadFileToString(Env::Default(), FLAGS_op_log_path, &op_log_str); - if (!s.ok()) { - fprintf(stderr, "Failed to read op_log_path: %s\n", s.ToString().c_str()); - return 1; - } - if (!ParseProtoUnlimited(op_log.get(), op_log_str)) { - fprintf(stderr, "Failed to parse op_log_path\n"); - return 1; - } - } - std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader; TF_Status* status = TF_NewStatus(); if (!FLAGS_checkpoint_path.empty()) { @@ -204,25 +194,52 @@ int Run(int argc, char** argv) { TF_DeleteStatus(status); } - TFStats tf_stat(std::move(graph), nullptr, std::move(op_log), - std::move(ckpt_reader)); + std::unique_ptr<TFStats> tf_stat; + if (!FLAGS_profile_path.empty()) { + tf_stat.reset(new TFStats(FLAGS_profile_path, std::move(ckpt_reader))); + } else { + printf( + "Try to use a single --profile_path instead of " + "graph_path,op_log_path,run_meta_path\n"); + std::unique_ptr<GraphDef> graph(new GraphDef()); + TF_CHECK_OK( + ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false)); - std::vector<string> run_meta_files = - str_util::Split(FLAGS_run_meta_path, ',', str_util::SkipEmpty()); - for (int i = 0; i < run_meta_files.size(); ++i) { - std::unique_ptr<RunMetadata> run_meta(new RunMetadata()); - s = ReadProtoFile(Env::Default(), run_meta_files[i], run_meta.get(), true); - if (!s.ok()) { - fprintf(stderr, "Failed to read run_meta_path %s. Status: %s\n", - run_meta_files[i].c_str(), s.ToString().c_str()); - return 1; + std::unique_ptr<OpLogProto> op_log(new OpLogProto()); + if (!FLAGS_op_log_path.empty()) { + string op_log_str; + s = ReadFileToString(Env::Default(), FLAGS_op_log_path, &op_log_str); + if (!s.ok()) { + fprintf(stderr, "Failed to read op_log_path: %s\n", + s.ToString().c_str()); + return 1; + } + if (!ParseProtoUnlimited(op_log.get(), op_log_str)) { + fprintf(stderr, "Failed to parse op_log_path\n"); + return 1; + } + } + tf_stat.reset(new TFStats(std::move(graph), nullptr, std::move(op_log), + std::move(ckpt_reader))); + + std::vector<string> run_meta_files = + str_util::Split(FLAGS_run_meta_path, ',', str_util::SkipEmpty()); + for (int i = 0; i < run_meta_files.size(); ++i) { + std::unique_ptr<RunMetadata> run_meta(new RunMetadata()); + s = ReadProtoFile(Env::Default(), run_meta_files[i], run_meta.get(), + true); + if (!s.ok()) { + fprintf(stderr, "Failed to read run_meta_path %s. Status: %s\n", + run_meta_files[i].c_str(), s.ToString().c_str()); + return 1; + } + tf_stat->AddRunMeta(i, std::move(run_meta)); } - tf_stat.AddRunMeta(i, std::move(run_meta)); } if (cmd == kCmds[4]) { - tf_stat.BuildAllViews(); - Advisor(&tf_stat).Advise(Advisor::DefaultOptions()); + tf_stat->BuildAllViews(); + Advisor(tf_stat.get()).Advise(Advisor::DefaultOptions()); return 0; } @@ -236,12 +253,12 @@ int Run(int argc, char** argv) { select, output_type, output_options); if (cmd == kCmds[2] || cmd == kCmds[3]) { - tf_stat.BuildView(cmd); - tf_stat.ShowMultiGraphNode(cmd, opts); + tf_stat->BuildView(cmd); + tf_stat->ShowMultiGraphNode(cmd, opts); return 0; } else if (cmd == kCmds[0] || cmd == kCmds[1]) { - tf_stat.BuildView(cmd); - tf_stat.ShowGraphNode(cmd, opts); + tf_stat->BuildView(cmd); + tf_stat->ShowGraphNode(cmd, opts); return 0; } @@ -270,14 +287,14 @@ int Run(int argc, char** argv) { } else if (cmd == kCmds[6]) { PrintHelp(); } else if (cmd == kCmds[2] || cmd == kCmds[3]) { - tf_stat.BuildView(cmd); - tf_stat.ShowMultiGraphNode(cmd, new_opts); + tf_stat->BuildView(cmd); + tf_stat->ShowMultiGraphNode(cmd, new_opts); } else if (cmd == kCmds[0] || cmd == kCmds[1]) { - tf_stat.BuildView(cmd); - tf_stat.ShowGraphNode(cmd, new_opts); + tf_stat->BuildView(cmd); + tf_stat->ShowGraphNode(cmd, new_opts); } else if (cmd == kCmds[4]) { - tf_stat.BuildAllViews(); - Advisor(&tf_stat).Advise(Advisor::DefaultOptions()); + tf_stat->BuildAllViews(); + Advisor(tf_stat.get()).Advise(Advisor::DefaultOptions()); } } return 0; diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto index 006ee2b7a5..1ce5a5eecf 100644 --- a/tensorflow/core/profiler/tfprof_log.proto +++ b/tensorflow/core/profiler/tfprof_log.proto @@ -2,6 +2,8 @@ syntax = "proto3"; package tensorflow.tfprof; +import "tensorflow/core/framework/attr_value.proto"; + // It specifies the Python callstack that creates an op. message CodeDef { repeated Trace traces = 1; @@ -31,3 +33,94 @@ message OpLogEntry { message OpLogProto { repeated OpLogEntry log_entries = 1; } + +// A proto representation of the profiler's profile. +// It allows serialization, shipping around and deserialization of the profiles. +// +// Please don't depend on the internals of the profile proto. +message ProfileProto { + map<int64, ProfileNode> nodes = 1; + // Whether or not has code traces. + bool has_trace = 2; +} + +message ProfileNode { + // graph node name. + string name = 1; + // graph operation type. + string op = 9; + // A unique id for the node. + int64 id = 13; + + map<int32, int64> inputs = 2; + map<int32, Tuple> input_shapes = 16; + map<int32, int64> outputs = 3; + map<int32, Tuple> output_shapes = 15; + // A map from source node id to its output index to current node. + map<int64, int32> src_output_index = 14; + + repeated int64 shape = 4; + repeated string op_types = 5; + string canonical_device = 6; + string host_device = 7; + + int64 float_ops = 8; + + CodeDef trace = 10; + map<string, AttrValue> attrs = 11; + + map<int64, ExecProfile> execs = 12; +} + +message ExecProfile { + // Can be larger than 1 if run multiple times in loop. + int64 run_count = 1; + // The earliest/latest time including scheduling and execution. + int64 all_start_micros = 2; + int64 latest_end_micros = 3; + + // device -> vector of {op_start_micros, op_exec_micros} pairs. + // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros} + // For accelerator, vector size can be larger than 1, multiple kernel fires + // or in tf.while_loop. + map<string, ExecTime> accelerator_execs = 4; + // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros} + // For cpu, vector size can be larger than 1 if in tf.while_loop. + map<string, ExecTime> cpu_execs = 5; + + map<int32, Memory> output_memory = 17; + + repeated string devices = 6; + + // Total bytes requested by the op. + int64 requested_bytes = 7; + // Total bytes requested by the op and released before op end. + int64 peak_bytes = 8; + // Total bytes requested by the op and not released after op end. + int64 residual_bytes = 9; + // Total bytes output by the op (not necessarily requested by the op). + int64 output_bytes = 10; + // Total temporary bytes allocated and released by the op. + int64 host_temp_bytes = 11; + // Total persistent bytes (e.g. variable) allocated by the op. + int64 host_persistent_bytes = 12; + int64 accelerator_temp_bytes = 13; + int64 accelerator_persistent_bytes = 14; + // The total number of bytes currently allocated by the allocator if >0. + int64 allocator_bytes_in_use = 15; + + bool memory_intialized = 16; +} + +message ExecTime { + repeated Tuple times = 1; +} + +message Tuple { + repeated int64 int64_values = 1; +} + +message Memory { + int64 bytes = 1; + uint64 ptr = 2; +} |