diff options
author | Mark Heffernan <meheff@google.com> | 2017-02-10 15:28:12 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-02-10 15:49:40 -0800 |
commit | 56eb0007c04fe47f61323a3014c9a7fb176b0d70 (patch) | |
tree | 49e735249dab837db24d01f5c604ce4a8bf19874 /tensorflow/compiler/xla/service/hlo_execution_profile.cc | |
parent | 674fdc3d90031da055767a2fda5ac8bfd2b2feb5 (diff) |
Add bytes accessed to HLO profile output. Bytes accessed is a measure of the bytes read from or written to memory during execution of an HLO op. It is typically the sum of the sizes of the operands and output. Sample line from the profile table:
337 cycles ( 47.87%) :: 0.5 usec @ f_nom :: 263.80MFLOP/s :: 12.0KiB :: %multiply = ...
The 12.0KiB is the change.
As part of this change, unconditionally gather bytes accessed information with HloCostAnalysis. This requires that the shape size computation be universally accessible, so a ShapeSizeBytes method was added to xla::Compiler, which enabled some cleanup in various places.
Change: 147206509
Diffstat (limited to 'tensorflow/compiler/xla/service/hlo_execution_profile.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/hlo_execution_profile.cc | 34 |
1 files changed, 26 insertions, 8 deletions
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc index 82c85635f0..e2a81a052c 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc @@ -60,16 +60,32 @@ string HloExecutionProfile::ToString( return cycles / clock_rate_ghz / 1000.0; }; - auto append_item = [&](int64 cycles, int64 flops, const string& name) { + auto append_item = [&](int64 cycles, int64 flops, int64 bytes_accessed, + const string& name) { double nsecs = cycles / clock_rate_ghz; + string bytes_per_sec; + string bytes_per_cycle; + if (bytes_accessed >= 0) { + bytes_per_sec = tensorflow::strings::HumanReadableNumBytes( + bytes_accessed / (nsecs / 1e9)); + bytes_per_cycle = + tensorflow::strings::HumanReadableNumBytes(bytes_accessed / cycles); + } else { + bytes_per_sec = "<unknown>"; + bytes_per_cycle = "<unknown>"; + } + tensorflow::strings::StrAppend( &result, tensorflow::strings::Printf( - "%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s :: %s", + "%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s :: %12s/s " + ":: " + "%12s/cycle :: " + "%s", cycles, cycles / static_cast<double>(total_cycles) * 100, cycles_to_microseconds(cycles), flops <= 0 ? "<none>" : HumanReadableNumFlops(flops, nsecs).c_str(), - name.c_str())); + bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str())); }; tensorflow::strings::StrAppend( &result, @@ -77,13 +93,15 @@ string HloExecutionProfile::ToString( tensorflow::strings::HumanReadableElapsedTime( total_cycles / clock_rate_ghz / 1e9) .c_str())); - append_item(total_cycles, -1, "[total]"); + append_item(total_cycles, -1, -1, "[total]"); for (const auto& item : items) { + const HloInstruction* hlo = item.first; tensorflow::strings::StrAppend(&result, "\n\t"); - auto flops = - item.first == nullptr ? -1 : cost_analysis.flop_count(*item.first); - string display = item.first == nullptr ? 
"<none>" : item.first->ToString(); - append_item(item.second, flops, display); + int64 flops = hlo == nullptr ? -1 : cost_analysis.flop_count(*hlo); + int64 bytes_accessed = + hlo == nullptr ? -1 : cost_analysis.bytes_accessed(*hlo); + string display = hlo == nullptr ? "<none>" : hlo->ToString(); + append_item(item.second, flops, bytes_accessed, display); } MetricTableReport table; |