aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/profiler
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2017-09-06 21:56:33 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-09-06 22:01:54 -0700
commit9e9ffa33d154b6c332bd475d6da2030746460fed (patch)
tree64de1c00853926d394f6f4ba82f01f448e990fda /tensorflow/core/profiler
parent27ce2a4ac956941ba8a0b9aaaa77acc0aa861fef (diff)
Unify all profile files (graph,run_meta,op_log) into one.
Also allow profiler to serialize/deserialize to/from file. PiperOrigin-RevId: 167815923
Diffstat (limited to 'tensorflow/core/profiler')
-rw-r--r--tensorflow/core/profiler/README.md92
-rw-r--r--tensorflow/core/profiler/internal/advisor/operation_checker.h7
-rw-r--r--tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc2
-rw-r--r--tensorflow/core/profiler/internal/print_model_analysis.cc40
-rw-r--r--tensorflow/core/profiler/internal/print_model_analysis.h9
-rw-r--r--tensorflow/core/profiler/internal/tfprof_graph.cc10
-rw-r--r--tensorflow/core/profiler/internal/tfprof_node.cc68
-rw-r--r--tensorflow/core/profiler/internal/tfprof_node.h313
-rw-r--r--tensorflow/core/profiler/internal/tfprof_scope.cc4
-rw-r--r--tensorflow/core/profiler/internal/tfprof_show_test.cc23
-rw-r--r--tensorflow/core/profiler/internal/tfprof_stats.cc78
-rw-r--r--tensorflow/core/profiler/internal/tfprof_stats.h12
-rw-r--r--tensorflow/core/profiler/internal/tfprof_stats_test.cc18
-rw-r--r--tensorflow/core/profiler/internal/tfprof_utils.cc1
-rw-r--r--tensorflow/core/profiler/profiler.cc103
-rw-r--r--tensorflow/core/profiler/tfprof_log.proto93
16 files changed, 631 insertions, 242 deletions
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index f0d4dafd3e..2ddac0a79b 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -21,60 +21,70 @@
### Quick Start
```python
-# Enable tracing and profile the time and memory information.
+# When using high-level API, session is usually hidden.
+#
+# Under the default ProfileContext, run a few hundred steps.
+# The ProfileContext will sample some steps and dump the profiles
+# to files. Users can then use command line tool or Web UI for
+# interactive profiling.
+with tf.contrib.tfprof.ProfileContext('/tmp/train_dir') as pctx:
+ # High level API, such as slim, Estimator, etc.
+ train_loop()
+
+bazel-bin/tensorflow/core/profiler/profiler \
+ --profile_path=/tmp/train_dir/profile_xx
+tfprof> op -select micros,bytes,occurrence -order_by micros
+
+# To be open sourced...
+bazel-bin/third_party/tensorflow/python/profiler/profiler_ui \
+ --profile_path=/tmp/profiles/profile_1
+```
+![ProfilerUI](g3doc/profiler_ui.jpg)
+
+```python
+# When using lower-level APIs with a Session object. User can have
+# explicit control of each step.
+#
+# Create options to profile the time and memory information.
builder = tf.profiler.ProfileOptionBuilder
opts = builder(builder.time_and_memory()).order_by('micros').build()
-with tf.contrib.tfprof.ProfileContext() as pctx:
+# Create a profiling context, set constructor argument `trace_steps`,
+# `dump_steps` to empty for explicit control.
+with tf.contrib.tfprof.ProfileContext('/tmp/train_dir',
+ trace_steps=[],
+ dump_steps=[]) as pctx:
with tf.Session() as sess:
- x = build_model()
- sess.run(tf.global_variables_initializer())
- for i in range(10000):
- if i == 10:
- pctx.capture_next_run_meta()
- _ = sess.run(x)
- tf.profiler.profile(sess.graph, run_meta=pctx.run_meta(), cmd='op', options=opts)
- else:
- _ = sess.run(x)
+ # Enable tracing for next session.run.
+ pctx.trace_next_step()
+ # Dump the profile to '/tmp/train_dir' after the step.
+ pctx.dump_next_step()
+ _ = sess.run(train_op)
+ pctx.profiler.profile_operations(options=opts)
```
```python
-# When using high-level API, session is usually hidden.
-# Register profiling to ProfileContext that performs profiling automatically.
+# For more advanced usage, user can control the tracing steps and
+# dumping steps. User can also run online profiling during training.
+#
+# Create options to profile time/memory as well as parameters.
builder = tf.profiler.ProfileOptionBuilder
opts = builder(builder.time_and_memory()).order_by('micros').build()
opts2 = tf.profiler.ProfileOptionBuilder.trainable_variables_parameter()
-with tf.contrib.tfprof.ProfileContext() as pctx:
- # Profile time and memory from step 100 to 109.
- # Note: High level APIs can run sessions for queue, variable initialization
- # or training at different steps. Try profile multiple steps.
- pctx.add_auto_profiling('op', opts, range(100, 110))
- # Profile model parameters once at step 10 (since it is static).
- pctx.add_auto_profiling('scope', opts2, [10])
+
+# Collect traces of steps 10~20, dump the whole profile (with traces of
+# step 10~20) at step 20. The dumped profile can be used for further profiling
+# with command line interface or Web UI.
+with tf.contrib.tfprof.ProfileContext('/tmp/train_dir',
+ trace_steps=range(10, 20),
+ dump_steps=[20]) as pctx:
+ # Run online profiling with 'op' view and 'opts' options at step 15, 18, 20.
+ pctx.add_auto_profiling('op', opts, [15, 18, 20])
+ # Run online profiling with 'scope' view and 'opts2' options at step 20.
+ pctx.add_auto_profiling('scope', opts2, [20])
# High level API, such as slim, Estimator, etc.
train_loop()
```
-```shell
-# Profiling from Python API is not interactive.
-# Dump the profiles to files and profile with interactive command line or web UI.
-with tf.contrib.tfprof.ProfileContext() as pctx:
- pctx.add_auto_profile_dump('/tmp/profiles', [100])
- train_loop()
-
-bazel-bin/tensorflow/core/profiler/profiler \
- --graph_path=/tmp/profiles/graph.pbtxt \
- --run_meta_path=/tmp/profiles/run_meta \
- --op_log_path=/tmp/profiles/tfprof_log \
-tfprof> op -select micros,bytes,occurrence -order_by micros
-
-
-# To be open sourced...
-bazel-bin/third_party/tensorflow/python/profiler/profiler_ui \
- --graph_path=/tmp/profiles/graph.pbtxt \
- --run_meta_path=/tmp/profiles/run_meta \
- --op_log_path=/tmp/profiles/tfprof_log \
-```
-![ProfilerUI](g3doc/profiler_ui.jpg)
<b>Detail Tutorials</b>
diff --git a/tensorflow/core/profiler/internal/advisor/operation_checker.h b/tensorflow/core/profiler/internal/advisor/operation_checker.h
index 08f6d3c389..f0bd72fa40 100644
--- a/tensorflow/core/profiler/internal/advisor/operation_checker.h
+++ b/tensorflow/core/profiler/internal/advisor/operation_checker.h
@@ -45,9 +45,10 @@ class OperationChecker : public Checker {
if (node->op_types().find("FusedBatchNorm") != node->op_types().end()) {
use_fused_batch_norm = true;
}
- if (node->op_attrs().find("data_format") != node->op_attrs().end()) {
- const AttrValue* attr_val = node->op_attrs().at("data_format");
- if (attr_val->s() == "NHWC" &&
+
+ const AttrValue* attr = node->op_attrs("data_format");
+ if (attr) {
+ if (attr->s() == "NHWC" &&
IsPlacedOnAccelerator(node->canonical_device())) {
recommend_nchw = true;
}
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index 23ed287f7b..c39d44b7fa 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -48,7 +48,7 @@ class TFProfAdvisorTest : public ::testing::Test {
for (const auto& attr : attrs) {
(*def->mutable_attr())[attr.first].set_s(attr.second);
}
- std::unique_ptr<TFGraphNode> node(new TFGraphNode(def));
+ std::unique_ptr<TFGraphNode> node(new TFGraphNode(def, -1));
NodeExecStats node_stat;
node_stat.set_all_start_micros(start_miros);
diff --git a/tensorflow/core/profiler/internal/print_model_analysis.cc b/tensorflow/core/profiler/internal/print_model_analysis.cc
index fd46b957e8..ddf3c7f1f2 100644
--- a/tensorflow/core/profiler/internal/print_model_analysis.cc
+++ b/tensorflow/core/profiler/internal/print_model_analysis.cc
@@ -90,38 +90,61 @@ bool NewProfiler(const string* graph, const string* op_log) {
if (!graph_ptr->ParseFromString(*graph)) {
if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) {
fprintf(stderr, "Failed to parse graph\n");
+ return false;
}
}
std::unique_ptr<OpLogProto> op_log_ptr;
if (op_log && !op_log->empty()) {
op_log_ptr.reset(new OpLogProto());
- op_log_ptr->ParseFromString(*op_log);
+ if (!op_log_ptr->ParseFromString(*op_log)) {
+ fprintf(stderr, "Failed to parse OpLogProto.\n");
+ return false;
+ }
}
tf_stat = new TFStats(std::move(graph_ptr), nullptr, std::move(op_log_ptr),
nullptr);
return true;
}
+void ProfilerFromFile(const string* filename) {
+ CHECK(!tf_stat) << "Currently only 1 living tfprof profiler is allowed";
+ CHECK(filename) << "Missing profile filename to init profiler from file";
+ tf_stat = new TFStats(*filename, nullptr);
+}
+
void DeleteProfiler() {
- delete tf_stat;
- tf_stat = nullptr;
+ if (tf_stat) {
+ delete tf_stat;
+ tf_stat = nullptr;
+ }
}
-void AddStep(int64 step, const string* run_meta, const string* op_log) {
+void AddStep(int64 step, const string* graph, const string* run_meta,
+ const string* op_log) {
CHECK(tf_stat);
+
+ CHECK(graph && !graph->empty());
+ std::unique_ptr<GraphDef> graph_ptr(new GraphDef());
+ if (!graph_ptr->ParseFromString(*graph)) {
+ if (!protobuf::TextFormat::ParseFromString(*graph, graph_ptr.get())) {
+ fprintf(stderr, "Failed to parse graph\n");
+ }
+ }
+ tf_stat->AddGraph(std::move(graph_ptr));
+
CHECK(run_meta && !run_meta->empty());
// TODO(xpan): Better error handling.
std::unique_ptr<RunMetadata> run_meta_ptr(new RunMetadata());
run_meta_ptr->ParseFromString(*run_meta);
tf_stat->AddRunMeta(step, std::move(run_meta_ptr));
- std::unique_ptr<OpLogProto> op_log_ptr;
if (op_log && !op_log->empty()) {
+ std::unique_ptr<OpLogProto> op_log_ptr;
op_log_ptr.reset(new OpLogProto());
op_log_ptr->ParseFromString(*op_log);
+ tf_stat->AddOpLogProto(std::move(op_log_ptr));
}
- tf_stat->AddOpLogProto(std::move(op_log_ptr));
}
string Profile(const string* command, const string* options) {
@@ -131,6 +154,11 @@ string Profile(const string* command, const string* options) {
return RunProfile(*command, *options, tf_stat);
}
+void WriteProfile(const string* filename) {
+ CHECK(filename) << "empty file name when asking to write profile.";
+ tf_stat->WriteProfile(*filename);
+}
+
string PrintModelAnalysis(const string* graph, const string* run_meta,
const string* op_log, const string* command,
const string* options) {
diff --git a/tensorflow/core/profiler/internal/print_model_analysis.h b/tensorflow/core/profiler/internal/print_model_analysis.h
index 76a436f71c..e4d01041a8 100644
--- a/tensorflow/core/profiler/internal/print_model_analysis.h
+++ b/tensorflow/core/profiler/internal/print_model_analysis.h
@@ -35,7 +35,14 @@ bool NewProfiler(const string* graph, const string* op_log);
void DeleteProfiler();
-void AddStep(int64 step, const string* run_meta, const string* op_log);
+void AddStep(int64 step, const string* graph, const string* run_meta,
+ const string* op_log);
+
+// Write the profiler's profile to a proto buffer.
+void WriteProfile(const string* filename);
+
+// Load the profile to profiler from a proto buffer file.
+void ProfilerFromFile(const string* filename);
string Profile(const string* command, const string* options);
diff --git a/tensorflow/core/profiler/internal/tfprof_graph.cc b/tensorflow/core/profiler/internal/tfprof_graph.cc
index e58924a614..3766365bf8 100644
--- a/tensorflow/core/profiler/internal/tfprof_graph.cc
+++ b/tensorflow/core/profiler/internal/tfprof_graph.cc
@@ -30,8 +30,8 @@ GraphNode* TFGraph::CreateParentNode(const string& name) {
node_defs_.push_back(std::unique_ptr<NodeDef>(new NodeDef()));
node_defs_.back()->set_name(name);
node_defs_.back()->set_op(kTFGraphParent);
- parent_nodes_[name] =
- std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get()));
+ parent_nodes_[name] = std::unique_ptr<TFGraphNode>(
+ new TFGraphNode(node_defs_.back().get(), -1));
nodes_map_[name] =
std::unique_ptr<GraphNode>(new GraphNode(parent_nodes_[name].get()));
return nodes_map_[name].get();
@@ -49,11 +49,11 @@ void TFGraph::Build() {
// Filter out the root nodes (node not input of any other node).
for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
GraphNode* node = it->second.get();
- const std::map<int, TFGraphNode*>& inputs = node->node->inputs();
+ const std::map<int, string>& inputs = node->node->inputs();
for (auto inputs_it = inputs.cbegin(); inputs_it != inputs.cend();
inputs_it++) {
- nonroots.insert(inputs_it->second->name());
- auto child_it = nodes_map_.find(inputs_it->second->name());
+ nonroots.insert(inputs_it->second);
+ auto child_it = nodes_map_.find(inputs_it->second);
if (child_it != nodes_map_.end()) {
node->children.push_back(child_it->second.get());
}
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 19257968e6..e2be2cf4cf 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -51,19 +51,21 @@ bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }
void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
devices_.insert(dev);
if (step_stat.all_start_micros() > 0) {
- if (all_start_micros_ > 0) {
- all_start_micros_ = std::min(
- all_start_micros_, static_cast<int64>(step_stat.all_start_micros()));
+ if (exec_.all_start_micros() > 0) {
+ exec_.set_all_start_micros(
+ std::min(static_cast<int64>(exec_.all_start_micros()),
+ static_cast<int64>(step_stat.all_start_micros())));
} else {
- all_start_micros_ = step_stat.all_start_micros();
+ exec_.set_all_start_micros(step_stat.all_start_micros());
}
int64 op_end_rel_micros = step_stat.op_end_rel_micros();
// Round quick execution to 1 micro to be semantically robust.
if (op_end_rel_micros == 0) {
++op_end_rel_micros;
}
- latest_end_micros_ = std::max(
- latest_end_micros_, step_stat.all_start_micros() + op_end_rel_micros);
+ exec_.set_latest_end_micros(
+ std::max(static_cast<int64>(exec_.latest_end_micros()),
+ step_stat.all_start_micros() + op_end_rel_micros));
const std::pair<int64, int64> pair =
std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
@@ -75,17 +77,17 @@ void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
op_execs_[dev].push_back(pair);
// In while-loop, a graph node is executed multiple times under
// the same name.
- run_count_ += 1;
+ exec_.set_run_count(exec_.run_count() + 1);
}
}
}
void ExecStep::AddMemoryStats(const string& dev,
const NodeExecStats& step_stat) {
- if (mem_initiated_) {
+ if (exec_.memory_intialized()) {
return;
}
- mem_initiated_ = true;
+ exec_.set_memory_intialized(true);
for (const auto& mem : step_stat.memory()) {
// TODO(xpan): Fix this hack. Currently the allocator name seems quite
@@ -93,9 +95,9 @@ void ExecStep::AddMemoryStats(const string& dev,
if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
continue;
}
- allocator_bytes_in_use_ =
- std::max(allocator_bytes_in_use_,
- static_cast<int64>(mem.allocator_bytes_in_use()));
+ exec_.set_allocator_bytes_in_use(
+ std::max(static_cast<int64>(exec_.allocator_bytes_in_use()),
+ static_cast<int64>(mem.allocator_bytes_in_use())));
}
int64 total_output_bytes = 0;
for (const auto& output : step_stat.output()) {
@@ -114,16 +116,20 @@ void ExecStep::AddMemoryStats(const string& dev,
output_memory_[output.slot()] = std::make_pair(output_bytes, output_ptr);
}
}
- output_bytes_ = total_output_bytes;
+ exec_.set_output_bytes(total_output_bytes);
if (step_stat.has_memory_stats()) {
- host_temp_bytes_ += step_stat.memory_stats().host_temp_memory_size();
- host_persistent_bytes_ +=
- step_stat.memory_stats().host_persistent_memory_size();
- accelerator_temp_bytes_ +=
- step_stat.memory_stats().device_temp_memory_size();
- accelerator_persistent_bytes_ +=
- step_stat.memory_stats().device_persistent_memory_size();
+ exec_.set_host_temp_bytes(exec_.host_temp_bytes() +
+ step_stat.memory_stats().host_temp_memory_size());
+ exec_.set_host_persistent_bytes(
+ exec_.host_persistent_bytes() +
+ step_stat.memory_stats().host_persistent_memory_size());
+ exec_.set_accelerator_temp_bytes(
+ exec_.accelerator_temp_bytes() +
+ step_stat.memory_stats().device_temp_memory_size());
+ exec_.set_accelerator_persistent_bytes(
+ exec_.accelerator_persistent_bytes() +
+ step_stat.memory_stats().device_persistent_memory_size());
}
int64 residual_bytes = 0;
int64 requested_bytes = 0;
@@ -133,9 +139,9 @@ void ExecStep::AddMemoryStats(const string& dev,
requested_bytes += mem.total_bytes();
peak_bytes += mem.peak_bytes();
}
- requested_bytes_ = requested_bytes;
- residual_bytes_ = residual_bytes;
- peak_bytes_ = peak_bytes;
+ exec_.set_requested_bytes(requested_bytes);
+ exec_.set_residual_bytes(residual_bytes);
+ exec_.set_peak_bytes(peak_bytes);
}
void TFGraphNode::AddStepStat(int64 step, const string& device,
@@ -147,33 +153,33 @@ void TFGraphNode::AddStepStat(int64 step, const string& device,
// It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
// It can has some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
if (IsCanonicalDevice(dev)) {
- if (!canonical_device_.empty()) {
- if (canonical_device_ != dev) {
+ if (!node_.canonical_device().empty()) {
+ if (node_.canonical_device() != dev) {
// TODO(xpan): Some RunMetadata node appears at multiple devices.
// Need to address it.
return;
}
} else {
- canonical_device_ = dev;
+ node_.set_canonical_device(dev);
// TODO(xpan): Support things other than gpu?
if (dev.find("sycl") != dev.npos) {
- host_device_ = StringReplace(dev, "device:sycl:\\d+", "cpu:0");
+ node_.set_host_device(StringReplace(dev, "device:sycl:\\d+", "cpu:0"));
} else {
- host_device_ = StringReplace(dev, "gpu:\\d+", "cpu:0");
+ node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
}
- AddOpType(canonical_device_);
+ AddOpType(node_.canonical_device());
}
}
auto exec = execs_.find(step);
if (exec == execs_.end()) {
- execs_.insert(std::pair<int64, ExecStep>(step, ExecStep(this)));
+ execs_.insert(std::pair<int64, ExecStep>(step, ExecStep()));
exec = execs_.find(step);
}
exec->second.AddTimeStats(dev, step_stat);
- if (dev == canonical_device_) {
+ if (dev == node_.canonical_device()) {
exec->second.AddMemoryStats(dev, step_stat);
}
}
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
index 5ec3da12cf..55d53f3923 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -44,27 +44,13 @@ class TFGraphNode;
class ExecStep {
public:
- ExecStep(TFGraphNode* node)
- : node(node),
- run_count_(0),
- all_start_micros_(0),
- latest_end_micros_(0),
- mem_initiated_(false),
- requested_bytes_(0),
- peak_bytes_(0),
- residual_bytes_(0),
- output_bytes_(0),
- host_temp_bytes_(0),
- host_persistent_bytes_(0),
- accelerator_temp_bytes_(0),
- accelerator_persistent_bytes_(0),
- allocator_bytes_in_use_(0) {}
+ ExecStep() {}
void AddTimeStats(const string& dev, const NodeExecStats& step_stat);
void AddMemoryStats(const string& dev, const NodeExecStats& step_stat);
- int64 run_count() const { return run_count_; }
+ int64 run_count() const { return exec_.run_count(); }
// The execution time of an op. If it runs on accelerator, then it's
// accelerator_exec_micros(). Otherwise, it's CPU time.
int64 exec_micros() const;
@@ -77,31 +63,101 @@ class ExecStep {
const {
return op_execs_;
}
- int64 all_start_micros() const { return all_start_micros_; }
- int64 latest_end_micros() const { return latest_end_micros_; }
-
- int64 requested_bytes() const { return requested_bytes_; }
- int64 peak_bytes() const { return peak_bytes_; }
- int64 residual_bytes() const { return residual_bytes_; }
- int64 output_bytes() const { return output_bytes_; }
- int64 accelerator_temp_bytes() const { return accelerator_temp_bytes_; }
- int64 host_temp_bytes() const { return host_temp_bytes_; }
+ int64 all_start_micros() const { return exec_.all_start_micros(); }
+ int64 latest_end_micros() const { return exec_.latest_end_micros(); }
+
+ int64 requested_bytes() const { return exec_.requested_bytes(); }
+ int64 peak_bytes() const { return exec_.peak_bytes(); }
+ int64 residual_bytes() const { return exec_.residual_bytes(); }
+ int64 output_bytes() const { return exec_.output_bytes(); }
+ int64 accelerator_temp_bytes() const {
+ return exec_.accelerator_temp_bytes();
+ }
+ int64 host_temp_bytes() const { return exec_.host_temp_bytes(); }
int64 accelerator_persistent_bytes() const {
- return accelerator_persistent_bytes_;
+ return exec_.accelerator_persistent_bytes();
}
- int64 host_persistent_bytes() const { return host_persistent_bytes_; }
- const std::map<int64, std::pair<int64, uint64>>& output_memory() const {
+ int64 host_persistent_bytes() const { return exec_.host_persistent_bytes(); }
+ const std::map<int32, std::pair<int64, uint64>>& output_memory() const {
return output_memory_;
}
- int64 allocator_bytes_in_use() const { return allocator_bytes_in_use_; }
+ int64 allocator_bytes_in_use() const {
+ return exec_.allocator_bytes_in_use();
+ }
+
+ const ExecProfile& ToProto() {
+ exec_.mutable_accelerator_execs()->clear();
+ for (const auto& e : accelerator_execs_) {
+ auto& exec_time = (*exec_.mutable_accelerator_execs())[e.first];
+ for (const auto& p : e.second) {
+ auto* t = exec_time.mutable_times()->Add();
+ t->add_int64_values(p.first);
+ t->add_int64_values(p.second);
+ }
+ }
+
+ exec_.mutable_cpu_execs()->clear();
+ for (const auto& e : cpu_execs_) {
+ auto& exec_time = (*exec_.mutable_cpu_execs())[e.first];
+ for (const auto& p : e.second) {
+ auto* t = exec_time.mutable_times()->Add();
+ t->add_int64_values(p.first);
+ t->add_int64_values(p.second);
+ }
+ }
+
+ exec_.mutable_devices()->Clear();
+ exec_.mutable_devices()->Reserve(devices_.size());
+ for (const string& d : devices_) {
+ exec_.add_devices(d);
+ }
+
+ exec_.mutable_output_memory()->clear();
+ for (const auto& mem : output_memory_) {
+ auto& mem_pb = (*exec_.mutable_output_memory())[mem.first];
+ mem_pb.set_bytes(mem.second.first);
+ mem_pb.set_ptr(mem.second.second);
+ }
+
+ return exec_;
+ }
+
+ void FromProto(const ExecProfile& exec) {
+ exec_.Clear();
+ exec_.MergeFrom(exec);
+
+ devices_.clear();
+ devices_.insert(exec.devices().begin(), exec.devices().end());
+
+ accelerator_execs_.clear();
+ cpu_execs_.clear();
+ op_execs_.clear();
+
+ for (const auto& exec_time : exec_.accelerator_execs()) {
+ auto& exec = accelerator_execs_[exec_time.first];
+ auto& op_exec = op_execs_[exec_time.first];
+ for (const auto& p : exec_time.second.times()) {
+ exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
+ op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
+ }
+ }
+ for (const auto& exec_time : exec_.cpu_execs()) {
+ auto& exec = cpu_execs_[exec_time.first];
+ auto& op_exec = op_execs_[exec_time.first];
+ for (const auto& p : exec_time.second.times()) {
+ exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
+ op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
+ }
+ }
+ for (const auto& output_mem : exec_.output_memory()) {
+ auto& mem = output_memory_[output_mem.first];
+ mem.first = output_mem.second.bytes();
+ mem.second = output_mem.second.ptr();
+ }
+ }
private:
- TFGraphNode* node;
- // Can be larger than 1 if run multiple times in loop.
- int64 run_count_;
- // The earliest/latest time including scheduling and execution.
- int64 all_start_micros_;
- int64 latest_end_micros_;
+ ExecProfile exec_;
// device -> vector of {op_start_micros, op_exec_micros} pairs.
// accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
// For accelerator, vector size can be larger than 1, multiple kernel fires
@@ -115,26 +171,8 @@ class ExecStep {
// All devices the op is associated with (e.g. gpu:0 (scheduling),
// gpu:0:stream:xx (kernel exec), cpu:0 host)
std::set<string> devices_;
-
- bool mem_initiated_;
- // Total bytes requested by the op.
- int64 requested_bytes_;
- // Total bytes requested by the op and released before op end.
- int64 peak_bytes_;
- // Total bytes requested by the op and not released after op end.
- int64 residual_bytes_;
- // Total bytes output by the op (not necessarily requested by the op).
- int64 output_bytes_;
- // Total temporary bytes allocated and released by the op.
- int64 host_temp_bytes_;
- // Total persistent bytes (e.g. variable) allocated by the op.
- int64 host_persistent_bytes_;
- int64 accelerator_temp_bytes_;
- int64 accelerator_persistent_bytes_;
- // The total number of bytes currently allocated by the allocator if >0.
- int64 allocator_bytes_in_use_;
// output_idx -> {output_bytes, memory_ptr}
- std::map<int64, std::pair<int64, uint64>> output_memory_;
+ std::map<int32, std::pair<int64, uint64>> output_memory_;
};
#define GRAPH_NODE_BYTES(type) \
@@ -157,10 +195,18 @@ class ExecStep {
class TFGraphNode {
public:
- TFGraphNode(const NodeDef* node)
- : node_(node), float_ops_(0), op_(node->op()) {
+ TFGraphNode(const ProfileNode& node, const ProfileProto& profile) {
+ FromProto(node, profile);
+ }
+
+ TFGraphNode(const NodeDef* node, int64 id) {
+ node_.set_id(id);
+ node_.set_name(node->name());
+ node_.set_op(node->op());
+ node_.set_float_ops(0);
+
for (const auto& attr : node->attr()) {
- op_attrs_[attr.first] = &attr.second;
+ (*node_.mutable_attrs())[attr.first].MergeFrom(attr.second);
if (attr.first == "shape" && attr.second.has_shape()) {
if (!shape_.empty()) {
fprintf(stderr, "Found duplicated shapes!\n");
@@ -180,10 +226,10 @@ class TFGraphNode {
op_types_.insert(node->op());
}
- void AddInput(TFGraphNode* input, int64 output_idx, int input_idx) {
+ void AddInput(TFGraphNode* input, int32 output_idx, int input_idx) {
src_output_idx_[input->name()] = output_idx;
- inputs_[input_idx] = input;
+ inputs_[input_idx] = input->name();
const auto& output_shape = input->output_shapes().find(output_idx);
// Always create an empty vec even if the shape info might be missing.
std::vector<int64>& shape_vec = input_shapes_[input_idx];
@@ -198,28 +244,123 @@ class TFGraphNode {
void AddStepStat(int64 step, const string& device,
const NodeExecStats& step_stat);
- void AddFloatOps(int64 float_ops) { float_ops_ = float_ops; }
+ void AddFloatOps(int64 float_ops) { node_.set_float_ops(float_ops); }
// TODO(xpan): This could take a lot of memory.
- void AddCode(const CodeDef& code) { code_.MergeFrom(code); }
+ void AddCode(const CodeDef& code) { node_.mutable_trace()->MergeFrom(code); }
- const string& name() const { return node_->name(); }
- const string& op() const { return op_; }
- const NodeDef* node_def() { return node_; }
+ const string& name() const { return node_.name(); }
+ int64 id() const { return node_.id(); }
+ const string& op() const { return node_.op(); }
+ const ProfileNode& node() { return node_; }
bool trackable(int64 step) const {
auto exec = execs_.find(step);
if (exec == execs_.end()) return false;
if (exec->second.all_start_micros() == 0) return false;
- if (canonical_device_.empty() || host_device_.empty()) {
+ if (node_.canonical_device().empty() || node_.host_device().empty()) {
return false;
}
return true;
}
- const std::map<int, TFGraphNode*>& inputs() const { return inputs_; }
- const std::map<string, int64>& src_output_idx() const {
+ const ProfileNode& ToProto(
+ const std::map<string, std::unique_ptr<TFGraphNode>>& nodes_map) {
+ node_.clear_shape();
+ node_.mutable_shape()->Reserve(shape().size());
+ for (int64 s : shape()) {
+ node_.add_shape(s);
+ }
+
+ node_.clear_op_types();
+ node_.mutable_op_types()->Reserve(op_types().size());
+ for (const string& t : op_types()) {
+ node_.add_op_types(t);
+ }
+
+ node_.clear_execs();
+ for (auto& exec : execs_) {
+ auto& exec_pb = (*node_.mutable_execs())[exec.first];
+ exec_pb.MergeFrom(exec.second.ToProto());
+ }
+
+ node_.clear_inputs();
+ for (const auto& inp : inputs_) {
+ (*node_.mutable_inputs())[inp.first] = nodes_map.at(inp.second)->id();
+ }
+
+ node_.clear_input_shapes();
+ for (const auto& s : input_shapes_) {
+ auto& shape = (*node_.mutable_input_shapes())[s.first];
+ for (int64 d : s.second) {
+ shape.add_int64_values(d);
+ }
+ }
+
+ node_.clear_output_shapes();
+ for (const auto& s : output_shapes_) {
+ auto& shape = (*node_.mutable_output_shapes())[s.first];
+ for (int64 d : s.second) {
+ shape.add_int64_values(d);
+ }
+ }
+
+ node_.clear_src_output_index();
+ for (const auto& s : src_output_idx_) {
+ int64 id = nodes_map.at(s.first)->id();
+ (*node_.mutable_src_output_index())[id] = s.second;
+ }
+ return node_;
+ }
+
+ void FromProto(const ProfileNode& node, const ProfileProto& profile) {
+ node_.Clear();
+ node_.MergeFrom(node);
+
+ op_types_.clear();
+ op_types_.insert(node_.op_types().begin(), node_.op_types().end());
+
+ shape_.clear();
+ for (int64 s : node_.shape()) {
+ shape_.push_back(s);
+ }
+
+ execs_.clear();
+ for (const auto& exec_pb : node.execs()) {
+ auto& exec = execs_[exec_pb.first];
+ exec.FromProto(exec_pb.second);
+ }
+
+ inputs_.clear();
+ for (const auto& inp : node.inputs()) {
+ inputs_[inp.first] = profile.nodes().at(inp.second).name();
+ }
+
+ input_shapes_.clear();
+ for (const auto& s : node.input_shapes()) {
+ auto& shape = input_shapes_[s.first];
+ for (const int64 d : s.second.int64_values()) {
+ shape.push_back(d);
+ }
+ }
+
+ output_shapes_.clear();
+ for (const auto& s : node.output_shapes()) {
+ auto& shape = output_shapes_[s.first];
+ for (const int64 d : s.second.int64_values()) {
+ shape.push_back(d);
+ }
+ }
+
+ src_output_idx_.clear();
+ for (const auto& s : node.src_output_index()) {
+ src_output_idx_[profile.nodes().at(s.first).name()] = s.second;
+ }
+ }
+
+ const std::map<int32, string>& inputs() const { return inputs_; }
+ const std::map<string, int32>& src_output_idx() const {
return src_output_idx_;
}
@@ -346,7 +487,7 @@ class TFGraphNode {
CHECK(exec != execs_.end()) << "unknown step " << step;
return exec->second.host_persistent_bytes();
}
- const std::map<int64, std::pair<int64, uint64>>& output_memory(
+ const std::map<int32, std::pair<int64, uint64>>& output_memory(
int64 step) const {
auto exec = execs_.find(step);
CHECK(exec != execs_.end()) << "unknown step " << step;
@@ -382,18 +523,22 @@ class TFGraphNode {
int64 float_ops(int64 step) const {
// If not run, return static analysis.
if (execs_.empty()) {
- return float_ops_;
+ return node_.float_ops();
}
// Otherwise, return dynamic float_ops.
- return float_ops_ * run_count(step);
+ return node_.float_ops() * run_count(step);
}
- const CodeDef& code() { return code_; }
- string canonical_device() const { return canonical_device_; }
- string host_device() const { return host_device_; }
+ const CodeDef& code() { return node_.trace(); }
+ string canonical_device() const { return node_.canonical_device(); }
+ string host_device() const { return node_.host_device(); }
const std::set<string>& op_types() const { return op_types_; }
- const std::map<string, const AttrValue*>& op_attrs() const {
- return op_attrs_;
+ const AttrValue* op_attrs(const string& name) const {
+ const auto it = node_.attrs().find(name);
+ if (it == node_.attrs().end()) {
+ return nullptr;
+ }
+ return &it->second;
}
const std::vector<int64>& shape() const { return shape_; }
@@ -406,12 +551,10 @@ class TFGraphNode {
}
private:
- std::map<int, TFGraphNode*> inputs_;
- std::map<string, int64> src_output_idx_;
+ std::map<int, string> inputs_;
+ std::map<string, int32> src_output_idx_;
- const NodeDef* node_;
-
- CodeDef code_;
+ ProfileNode node_;
std::vector<int64> shape_;
// Won't missing input_idx. But some shapes might be empty (unknown).
@@ -421,18 +564,8 @@ class TFGraphNode {
std::map<int, std::vector<int64>> output_shapes_;
std::set<string> op_types_;
- std::map<string, const AttrValue*> op_attrs_;
std::map<int64, ExecStep> execs_;
-
- // /j:#/t:#/r:#/device:#. A canonical device name without extra suffix.
- string canonical_device_;
- // The host device name.
- string host_device_;
-
- int64 float_ops_;
-
- string op_;
};
class TFMultiGraphNode {
diff --git a/tensorflow/core/profiler/internal/tfprof_scope.cc b/tensorflow/core/profiler/internal/tfprof_scope.cc
index d7293c5e52..128b296d5c 100644
--- a/tensorflow/core/profiler/internal/tfprof_scope.cc
+++ b/tensorflow/core/profiler/internal/tfprof_scope.cc
@@ -34,8 +34,8 @@ ScopeNode* TFScope::CreateParentNode(const string& name) {
node_defs_.push_back(std::unique_ptr<NodeDef>(new NodeDef()));
node_defs_.back()->set_name(name);
node_defs_.back()->set_op(kTFScopeParent);
- parent_nodes_[name] =
- std::unique_ptr<TFGraphNode>(new TFGraphNode(node_defs_.back().get()));
+ parent_nodes_[name] = std::unique_ptr<TFGraphNode>(
+ new TFGraphNode(node_defs_.back().get(), -1));
nodes_map_[name] =
std::unique_ptr<ScopeNode>(new ScopeNode(parent_nodes_[name].get()));
return nodes_map_[name].get();
diff --git a/tensorflow/core/profiler/internal/tfprof_show_test.cc b/tensorflow/core/profiler/internal/tfprof_show_test.cc
index 24cf9afb17..1f19f8c322 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show_test.cc
@@ -68,6 +68,23 @@ class TFProfShowTest : public ::testing::Test {
tf_stats_->BuildAllViews();
}
+ string TestToFromProto(const string& cmd, const Options& opts,
+ bool show_multi_node = false) {
+ string profile_file = io::JoinPath(testing::TmpDir(), "profile");
+ tf_stats_->WriteProfile(profile_file);
+ TFStats new_stats(profile_file, nullptr);
+ new_stats.BuildAllViews();
+ if (show_multi_node) {
+ new_stats.ShowMultiGraphNode(cmd, opts);
+ } else {
+ new_stats.ShowGraphNode(cmd, opts);
+ }
+ string dump_str;
+ TF_CHECK_OK(ReadFileToString(Env::Default(),
+ opts.output_options.at("outfile"), &dump_str));
+ return dump_str;
+ }
+
std::unique_ptr<TFStats> tf_stats_;
};
@@ -95,6 +112,8 @@ TEST_F(TFProfShowTest, DumpScopeMode) {
"0us/0us, 11us/11us)\n ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, "
"0B/0B, 0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
dump_str);
+
+ EXPECT_EQ(dump_str, TestToFromProto("scope", opts));
}
TEST_F(TFProfShowTest, DumpAcceleratorAndCPUMicros) {
@@ -140,6 +159,8 @@ TEST_F(TFProfShowTest, DumpAcceleratorAndCPUMicros) {
"0us/0us)\n ScalarW/read (0us/0us, 0us/0us)\n init (0us/0us, "
"0us/0us)\n",
dump_str);
+
+ EXPECT_EQ(dump_str, TestToFromProto("scope", opts));
}
TEST_F(TFProfShowTest, DumpOpMode) {
@@ -182,6 +203,8 @@ TEST_F(TFProfShowTest, DumpOpMode) {
"0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6\t(run*0|defined*1)"
"\texec_time:0us\n\n",
StringReplace(dump_str, " ", ""));
+
+ EXPECT_EQ(dump_str, TestToFromProto("op", opts, true));
}
} // namespace tfprof
} // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc
index 81db766796..5b54958344 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats.cc
@@ -46,12 +46,11 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
std::unique_ptr<OpLogProto> op_log,
std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader)
: has_code_traces_(false),
- graph_(std::move(graph)),
ckpt_reader_(std::move(ckpt_reader)) {
- CHECK(graph_) << "Must at least have GraphDef";
+ CHECK(graph) << "Must at least have GraphDef";
printf("Parsing Inputs...\n");
- ParseGraph();
+ AddGraph(std::move(graph));
if (run_meta && run_meta->has_step_stats()) {
AddRunMeta(0, std::move(run_meta));
}
@@ -67,6 +66,30 @@ TFStats::TFStats(std::unique_ptr<GraphDef> graph,
}
}
+TFStats::TFStats(const string& filename,
+ std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader)
+ : has_code_traces_(false), ckpt_reader_(std::move(ckpt_reader)) {
+ string str;
+ Status s = ReadFileToString(Env::Default(), filename, &str);
+ if (!s.ok()) {
+ fprintf(stderr, "Failed to read profile: %s", s.ToString().c_str());
+ return;
+ }
+
+ ProfileProto profile;
+ if (!profile.ParseFromString(str)) {
+ fprintf(stderr, "Failed to parse profile\n");
+ return;
+ }
+
+ for (const auto& node_pb : profile.nodes()) {
+ std::unique_ptr<TFGraphNode> node(new TFGraphNode(node_pb.second, profile));
+ nodes_map_.insert(std::pair<string, std::unique_ptr<TFGraphNode>>(
+ node_pb.second.name(), std::move(node)));
+ }
+ has_code_traces_ = profile.has_trace();
+}
+
void TFStats::BuildView(const string& cmd) {
if (cmd == kCmds[0] && !scope_view_) {
scope_view_.reset(new TFScope(ckpt_reader_.get()));
@@ -135,16 +158,20 @@ const MultiGraphNodeProto& TFStats::ShowMultiGraphNode(
}
}
-void TFStats::ParseGraph() {
- for (const NodeDef& node : graph_->node()) {
- CHECK(nodes_map_.find(node.name()) == nodes_map_.end());
+void TFStats::AddGraph(std::unique_ptr<GraphDef> graph) {
+ std::map<string, const NodeDef*> node_defs;
+ for (const NodeDef& node : graph->node()) {
+ if (nodes_map_.find(node.name()) != nodes_map_.end()) {
+ continue;
+ }
nodes_map_[node.name()] =
- std::unique_ptr<TFGraphNode>(new TFGraphNode(&node));
+ std::unique_ptr<TFGraphNode>(new TFGraphNode(&node, nodes_map_.size()));
+ node_defs[node.name()] = &node;
}
- for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
- const NodeDef* node_def = it->second->node_def();
- for (int i = 0; i < node_def->input_size(); ++i) {
- string node_input = node_def->input(i);
+ for (auto it = node_defs.begin(); it != node_defs.end(); it++) {
+ TFGraphNode* node = nodes_map_.at(it->first).get();
+ for (int i = 0; i < it->second->input_size(); ++i) {
+ string node_input = it->second->input(i);
int output_idx = 0;
// input name format can be: "^node:src_output"
auto prefix_pos = node_input.find(":");
@@ -160,10 +187,12 @@ void TFStats::ParseGraph() {
node_input = node_input.substr(1);
}
auto input_node = nodes_map_.find(node_input);
+ // TODO(xpan): P1: Add the input even if it doesn't exist yet, because
+ // this can be a partial graph.
if (input_node == nodes_map_.end()) {
continue;
}
- it->second->AddInput(input_node->second.get(), output_idx, i);
+ node->AddInput(input_node->second.get(), output_idx, i);
}
}
}
@@ -210,10 +239,8 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
if (node == nodes_map_.end()) {
NodeDef def;
if (CreateRunMetadataNode(name, &def)) {
- NodeDef* ndef = graph_->mutable_node()->Add();
- ndef->MergeFrom(def);
- nodes_map_[name] =
- std::unique_ptr<TFGraphNode>(new TFGraphNode(ndef));
+ nodes_map_[name] = std::unique_ptr<TFGraphNode>(
+ new TFGraphNode(&def, nodes_map_.size()));
nodes_map_.at(name)->AddStepStat(step, dev_stat.device(), node_stat);
}
} else {
@@ -223,6 +250,25 @@ void TFStats::AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta) {
}
}
+void TFStats::WriteProfile(const string& filename) {
+ ProfileProto profile;
+ for (auto it = nodes_map_.begin(); it != nodes_map_.end(); it++) {
+ if (it->second->id() < 0) {
+ continue;
+ }
+ (*profile.mutable_nodes())[it->second->id()].MergeFrom(
+ it->second->ToProto(nodes_map_));
+ if (it->second->code().traces_size() > 0) {
+ profile.set_has_trace(true);
+ }
+ }
+ Status s =
+ WriteStringToFile(Env::Default(), filename, profile.SerializeAsString());
+ if (!s.ok()) {
+ fprintf(stderr, "%s\n", s.ToString().c_str());
+ }
+}
+
bool TFStats::Validate(const Options& opts) const {
if (opts.step >= 0 && steps_.find(opts.step) == steps_.end()) {
fprintf(stderr, "Options -step=%lld not found\n", opts.step);
diff --git a/tensorflow/core/profiler/internal/tfprof_stats.h b/tensorflow/core/profiler/internal/tfprof_stats.h
index 8dbab16fe4..3b1251152d 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats.h
+++ b/tensorflow/core/profiler/internal/tfprof_stats.h
@@ -55,6 +55,10 @@ class TFStats {
std::unique_ptr<RunMetadata> run_meta,
std::unique_ptr<OpLogProto> op_log,
std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader);
+
+ TFStats(const string& filename,
+ std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader);
+
~TFStats() {}
const std::map<string, std::unique_ptr<TFGraphNode>>& nodes() const {
@@ -76,23 +80,25 @@ class TFStats {
const MultiGraphNodeProto& ShowMultiGraphNode(const string& cmd,
const Options& opts) const;
+ // Add a (partial) graph to the existing graph.
+ void AddGraph(std::unique_ptr<GraphDef> graph);
+
// Add a step of run time meta data.
void AddRunMeta(int64 step, std::unique_ptr<RunMetadata> run_meta);
// Add tfprof operation meta data, such as customized op type, float_ops,
// and code traces.
void AddOpLogProto(std::unique_ptr<OpLogProto> op_log);
+ void WriteProfile(const string& filename);
+
// For test purpose only.
void AddNodeForTest(int64 step, std::unique_ptr<TFGraphNode> node);
private:
bool Validate(const Options& opts) const;
- void ParseGraph();
-
std::set<int64> steps_;
bool has_code_traces_;
- std::unique_ptr<GraphDef> graph_;
std::unique_ptr<TFScope> scope_view_;
std::unique_ptr<TFGraph> graph_view_;
std::unique_ptr<TFCode> code_view_;
diff --git a/tensorflow/core/profiler/internal/tfprof_stats_test.cc b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
index 0bea14fe66..2f2101d76b 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
@@ -69,6 +69,14 @@ class TFProfStatsTest : public ::testing::Test {
tf_stats_->BuildAllViews();
}
+ string TestToFromProto(const string& cmd, const Options& opts) {
+ string profile_file = io::JoinPath(testing::TmpDir(), "profile");
+ tf_stats_->WriteProfile(profile_file);
+ TFStats new_stats(profile_file, nullptr);
+ new_stats.BuildAllViews();
+ return new_stats.ShowGraphNode(cmd, opts).DebugString();
+ }
+
std::unique_ptr<TFStats> tf_stats_;
};
@@ -98,6 +106,8 @@ TEST_F(TFProfStatsTest, CustomOpType) {
"2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+ EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts));
}
TEST_F(TFProfStatsTest, CheckPointOpType) {
@@ -126,6 +136,8 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
"2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+ EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts));
}
TEST_F(TFProfStatsTest, TestGraph) {
@@ -157,6 +169,8 @@ TEST_F(TFProfStatsTest, TestGraph) {
"9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+ EXPECT_EQ(root.DebugString(), TestToFromProto("graph", opts));
}
TEST_F(TFProfStatsTest, TestFloatOps) {
@@ -203,6 +217,8 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
"9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+ EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts));
}
TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
@@ -236,6 +252,8 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
"4096\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
&expected));
EXPECT_EQ(expected.DebugString(), root.DebugString());
+
+ EXPECT_EQ(root.DebugString(), TestToFromProto("scope", opts));
}
TEST_F(TFProfStatsTest, TestShowTensorValue) {
diff --git a/tensorflow/core/profiler/internal/tfprof_utils.cc b/tensorflow/core/profiler/internal/tfprof_utils.cc
index 383c4725b7..1ce59ad755 100644
--- a/tensorflow/core/profiler/internal/tfprof_utils.cc
+++ b/tensorflow/core/profiler/internal/tfprof_utils.cc
@@ -297,6 +297,7 @@ void PrintHelp() {
"See https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/profiler/"
"g3doc/command_line.md for command line tool tutorial.\n");
printf(
+ "profiler --profile_path=<ProfileProto binary file> # required\nor:\n"
"profiler --graph_path=<GraphDef proto file> # required\n"
" --run_meta_patn=<RunMetadata proto file> # optional\n"
" --run_log_path=<OpLogProto proto file> # optional\n\n");
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 6acf4ea377..96e0b06bf3 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -66,6 +66,7 @@ void completion(const char* buf, linenoiseCompletions* lc) {
}
int Run(int argc, char** argv) {
+ string FLAGS_profile_path = "";
string FLAGS_graph_path = "";
string FLAGS_run_meta_path = "";
string FLAGS_op_log_path = "";
@@ -96,6 +97,7 @@ int Run(int argc, char** argv) {
}
std::vector<Flag> flag_list = {
+ Flag("profile_path", &FLAGS_profile_path, "Profile binary file name."),
Flag("graph_path", &FLAGS_graph_path, "GraphDef proto text file name"),
Flag("run_meta_path", &FLAGS_run_meta_path,
"Comma-separated list of RunMetadata proto binary "
@@ -138,6 +140,12 @@ int Run(int argc, char** argv) {
}
port::InitMain(argv[0], &argc, &argv);
+ if (!FLAGS_profile_path.empty() && !FLAGS_graph_path.empty()) {
+ fprintf(stderr,
+ "both --graph_path and --profile_path are set. "
+ "Ignore graph_path\n");
+ }
+
std::vector<string> account_type_regexes =
str_util::Split(FLAGS_account_type_regexes, ',', str_util::SkipEmpty());
std::vector<string> start_name_regexes =
@@ -157,7 +165,7 @@ int Run(int argc, char** argv) {
CHECK(s.ok()) << s.ToString();
string cmd = "";
- if (argc == 1 && FLAGS_graph_path.empty()) {
+ if (argc == 1 && FLAGS_graph_path.empty() && FLAGS_profile_path.empty()) {
PrintHelp();
return 0;
} else if (argc > 1) {
@@ -173,24 +181,6 @@ int Run(int argc, char** argv) {
}
printf("Reading Files...\n");
- std::unique_ptr<GraphDef> graph(new GraphDef());
- TF_CHECK_OK(
- ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
-
- std::unique_ptr<OpLogProto> op_log(new OpLogProto());
- if (!FLAGS_op_log_path.empty()) {
- string op_log_str;
- s = ReadFileToString(Env::Default(), FLAGS_op_log_path, &op_log_str);
- if (!s.ok()) {
- fprintf(stderr, "Failed to read op_log_path: %s\n", s.ToString().c_str());
- return 1;
- }
- if (!ParseProtoUnlimited(op_log.get(), op_log_str)) {
- fprintf(stderr, "Failed to parse op_log_path\n");
- return 1;
- }
- }
-
std::unique_ptr<checkpoint::CheckpointReader> ckpt_reader;
TF_Status* status = TF_NewStatus();
if (!FLAGS_checkpoint_path.empty()) {
@@ -204,25 +194,52 @@ int Run(int argc, char** argv) {
TF_DeleteStatus(status);
}
- TFStats tf_stat(std::move(graph), nullptr, std::move(op_log),
- std::move(ckpt_reader));
+ std::unique_ptr<TFStats> tf_stat;
+ if (!FLAGS_profile_path.empty()) {
+ tf_stat.reset(new TFStats(FLAGS_profile_path, std::move(ckpt_reader)));
+ } else {
+ printf(
+ "Try to use a single --profile_path instead of "
+ "graph_path,op_log_path,run_meta_path\n");
+ std::unique_ptr<GraphDef> graph(new GraphDef());
+ TF_CHECK_OK(
+ ReadProtoFile(Env::Default(), FLAGS_graph_path, graph.get(), false));
- std::vector<string> run_meta_files =
- str_util::Split(FLAGS_run_meta_path, ',', str_util::SkipEmpty());
- for (int i = 0; i < run_meta_files.size(); ++i) {
- std::unique_ptr<RunMetadata> run_meta(new RunMetadata());
- s = ReadProtoFile(Env::Default(), run_meta_files[i], run_meta.get(), true);
- if (!s.ok()) {
- fprintf(stderr, "Failed to read run_meta_path %s. Status: %s\n",
- run_meta_files[i].c_str(), s.ToString().c_str());
- return 1;
+ std::unique_ptr<OpLogProto> op_log(new OpLogProto());
+ if (!FLAGS_op_log_path.empty()) {
+ string op_log_str;
+ s = ReadFileToString(Env::Default(), FLAGS_op_log_path, &op_log_str);
+ if (!s.ok()) {
+ fprintf(stderr, "Failed to read op_log_path: %s\n",
+ s.ToString().c_str());
+ return 1;
+ }
+ if (!ParseProtoUnlimited(op_log.get(), op_log_str)) {
+ fprintf(stderr, "Failed to parse op_log_path\n");
+ return 1;
+ }
+ }
+ tf_stat.reset(new TFStats(std::move(graph), nullptr, std::move(op_log),
+ std::move(ckpt_reader)));
+
+ std::vector<string> run_meta_files =
+ str_util::Split(FLAGS_run_meta_path, ',', str_util::SkipEmpty());
+ for (int i = 0; i < run_meta_files.size(); ++i) {
+ std::unique_ptr<RunMetadata> run_meta(new RunMetadata());
+ s = ReadProtoFile(Env::Default(), run_meta_files[i], run_meta.get(),
+ true);
+ if (!s.ok()) {
+ fprintf(stderr, "Failed to read run_meta_path %s. Status: %s\n",
+ run_meta_files[i].c_str(), s.ToString().c_str());
+ return 1;
+ }
+ tf_stat->AddRunMeta(i, std::move(run_meta));
}
- tf_stat.AddRunMeta(i, std::move(run_meta));
}
if (cmd == kCmds[4]) {
- tf_stat.BuildAllViews();
- Advisor(&tf_stat).Advise(Advisor::DefaultOptions());
+ tf_stat->BuildAllViews();
+ Advisor(tf_stat.get()).Advise(Advisor::DefaultOptions());
return 0;
}
@@ -236,12 +253,12 @@ int Run(int argc, char** argv) {
select, output_type, output_options);
if (cmd == kCmds[2] || cmd == kCmds[3]) {
- tf_stat.BuildView(cmd);
- tf_stat.ShowMultiGraphNode(cmd, opts);
+ tf_stat->BuildView(cmd);
+ tf_stat->ShowMultiGraphNode(cmd, opts);
return 0;
} else if (cmd == kCmds[0] || cmd == kCmds[1]) {
- tf_stat.BuildView(cmd);
- tf_stat.ShowGraphNode(cmd, opts);
+ tf_stat->BuildView(cmd);
+ tf_stat->ShowGraphNode(cmd, opts);
return 0;
}
@@ -270,14 +287,14 @@ int Run(int argc, char** argv) {
} else if (cmd == kCmds[6]) {
PrintHelp();
} else if (cmd == kCmds[2] || cmd == kCmds[3]) {
- tf_stat.BuildView(cmd);
- tf_stat.ShowMultiGraphNode(cmd, new_opts);
+ tf_stat->BuildView(cmd);
+ tf_stat->ShowMultiGraphNode(cmd, new_opts);
} else if (cmd == kCmds[0] || cmd == kCmds[1]) {
- tf_stat.BuildView(cmd);
- tf_stat.ShowGraphNode(cmd, new_opts);
+ tf_stat->BuildView(cmd);
+ tf_stat->ShowGraphNode(cmd, new_opts);
} else if (cmd == kCmds[4]) {
- tf_stat.BuildAllViews();
- Advisor(&tf_stat).Advise(Advisor::DefaultOptions());
+ tf_stat->BuildAllViews();
+ Advisor(tf_stat.get()).Advise(Advisor::DefaultOptions());
}
}
return 0;
diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto
index 006ee2b7a5..1ce5a5eecf 100644
--- a/tensorflow/core/profiler/tfprof_log.proto
+++ b/tensorflow/core/profiler/tfprof_log.proto
@@ -2,6 +2,8 @@ syntax = "proto3";
package tensorflow.tfprof;
+import "tensorflow/core/framework/attr_value.proto";
+
// It specifies the Python callstack that creates an op.
message CodeDef {
repeated Trace traces = 1;
@@ -31,3 +33,94 @@ message OpLogEntry {
message OpLogProto {
repeated OpLogEntry log_entries = 1;
}
+
+// A proto representation of the profiler's profile.
+// It allows profiles to be serialized, shipped around and deserialized.
+//
+// Please don't depend on the internals of the profile proto.
+message ProfileProto {
+ map<int64, ProfileNode> nodes = 1;
+ // Whether or not the profile has code traces.
+ bool has_trace = 2;
+}
+
+message ProfileNode {
+ // graph node name.
+ string name = 1;
+ // graph operation type.
+ string op = 9;
+ // A unique id for the node.
+ int64 id = 13;
+
+ map<int32, int64> inputs = 2;
+ map<int32, Tuple> input_shapes = 16;
+ map<int32, int64> outputs = 3;
+ map<int32, Tuple> output_shapes = 15;
+ // Maps a source node's id to the index of its output consumed by this node.
+ map<int64, int32> src_output_index = 14;
+
+ repeated int64 shape = 4;
+ repeated string op_types = 5;
+ string canonical_device = 6;
+ string host_device = 7;
+
+ int64 float_ops = 8;
+
+ CodeDef trace = 10;
+ map<string, AttrValue> attrs = 11;
+
+ map<int64, ExecProfile> execs = 12;
+}
+
+message ExecProfile {
+ // Can be larger than 1 if the op is run multiple times in a loop.
+ int64 run_count = 1;
+ // The earliest/latest time including scheduling and execution.
+ int64 all_start_micros = 2;
+ int64 latest_end_micros = 3;
+
+ // device -> vector of {op_start_micros, op_exec_micros} pairs.
+ // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
+ // For an accelerator, the vector size can be larger than 1: multiple kernels
+ // may fire, or the op may run inside tf.while_loop.
+ map<string, ExecTime> accelerator_execs = 4;
+ // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros}
+ // For cpu, vector size can be larger than 1 if in tf.while_loop.
+ map<string, ExecTime> cpu_execs = 5;
+
+ map<int32, Memory> output_memory = 17;
+
+ repeated string devices = 6;
+
+ // Total bytes requested by the op.
+ int64 requested_bytes = 7;
+ // Total bytes requested by the op and released before op end.
+ int64 peak_bytes = 8;
+ // Total bytes requested by the op and not released after op end.
+ int64 residual_bytes = 9;
+ // Total bytes output by the op (not necessarily requested by the op).
+ int64 output_bytes = 10;
+ // Total temporary bytes allocated and released by the op.
+ int64 host_temp_bytes = 11;
+ // Total persistent bytes (e.g. variable) allocated by the op.
+ int64 host_persistent_bytes = 12;
+ int64 accelerator_temp_bytes = 13;
+ int64 accelerator_persistent_bytes = 14;
+ // The total number of bytes currently allocated by the allocator if >0.
+ int64 allocator_bytes_in_use = 15;
+
+ bool memory_intialized = 16;
+}
+
+message ExecTime {
+ repeated Tuple times = 1;
+}
+
+message Tuple {
+ repeated int64 int64_values = 1;
+}
+
+message Memory {
+ int64 bytes = 1;
+ uint64 ptr = 2;
+}