diff options
author | David Majnemer <majnemer@google.com> | 2017-03-01 17:37:09 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-03-01 17:48:18 -0800 |
commit | af2c7253bb1f9d135ad9b0c6a271741205ab57fd (patch) | |
tree | 5bc2c03e568e73b419531679dce6a86d61064a72 /tensorflow/compiler/xla/service/hlo_execution_profile.cc | |
parent | 5ce1f684d2bb6e9220f68b9541d33342b5452918 (diff) |
[XLA] Add support for profiling multiple computations
While we are here, add support for getting the cost analysis for call HLOs.
Change: 148952748
Diffstat (limited to 'tensorflow/compiler/xla/service/hlo_execution_profile.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/hlo_execution_profile.cc | 35 |
1 file changed, 26 insertions, 9 deletions
```diff
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index e2a81a052c..447892c8de 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/metric_table_report.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -32,6 +33,7 @@ namespace xla {
 void HloExecutionProfile::AddProfileResult(const HloInstruction* hlo,
                                            uint64 cycles_taken) {
   hlo_to_cycles_taken_[hlo] = cycles_taken;
+  profiled_computations_.insert(hlo->parent());
 }
 
 uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
@@ -43,17 +45,30 @@ uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
 }
 
 string HloExecutionProfile::ToString(
+    const HloComputation& computation,
     const DeviceDescription& device_description,
-    const HloCostAnalysis& cost_analysis) const {
+    const HloCostAnalysis::ShapeSizeFunction& shape_size) const {
+  HloCostAnalysis cost_analysis(shape_size);
+  tensorflow::Status analysis_status =
+      computation.root_instruction()->Accept(&cost_analysis);
+  if (!analysis_status.ok()) {
+    return "";
+  }
+
   using Item = std::pair<const HloInstruction*, uint64>;
-  std::vector<Item> items(hlo_to_cycles_taken_.begin(),
-                          hlo_to_cycles_taken_.end());
+  std::vector<Item> items;
+  for (Item item : hlo_to_cycles_taken_) {
+    // Only include the HLOs which are part of the desired computation.
+    if (item.first->parent() == &computation) {
+      items.push_back(item);
+    }
+  }
   auto custom_less = [](const Item& lhs, const Item& rhs) {
     return lhs.second > rhs.second;
   };
   std::sort(items.begin(), items.end(), custom_less);
   string result;
-  const int64 total_cycles = total_cycles_executed();
+  const int64 total_cycles = total_cycles_executed(computation);
   double clock_rate_ghz = device_description.clock_rate_ghz();
 
   const auto cycles_to_microseconds = [&](double cycles) {
@@ -88,11 +103,13 @@ string HloExecutionProfile::ToString(
         bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str()));
   };
   tensorflow::strings::StrAppend(
-      &result,
-      tensorflow::strings::Printf("HLO execution profile: (%s @ f_nom)\n\t",
-                                  tensorflow::strings::HumanReadableElapsedTime(
-                                      total_cycles / clock_rate_ghz / 1e9)
-                                      .c_str()));
+      &result, tensorflow::strings::Printf(
+                   "HLO execution profile for %s: (%s @ f_nom)\n\t",
+                   computation.name().c_str(),
+                   tensorflow::strings::HumanReadableElapsedTime(
+                       total_cycles / clock_rate_ghz / 1e9)
+                       .c_str()));
+
   append_item(total_cycles, -1, -1, "[total]");
   for (const auto& item : items) {
     const HloInstruction* hlo = item.first;
```