[XLA] Add support for profiling multiple computations

While we are here, add support for getting the cost analysis for call HLOs. Change: 148952748
author: David Majnemer <majnemer@google.com> 2017-03-01 17:37:09 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-03-01 17:48:18 -0800
commit: af2c7253bb1f9d135ad9b0c6a271741205ab57fd (patch)
tree: 5bc2c03e568e73b419531679dce6a86d61064a72 /tensorflow/compiler/xla/service/hlo_execution_profile.cc
parent: 5ce1f684d2bb6e9220f68b9541d33342b5452918 (diff)
1 file changed, 26 insertions, 9 deletions
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index e2a81a052c..447892c8de 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/metric_table_report.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -32,6 +33,7 @@ namespace xla {
 void HloExecutionProfile::AddProfileResult(const HloInstruction* hlo,
                                            uint64 cycles_taken) {
   hlo_to_cycles_taken_[hlo] = cycles_taken;
+  profiled_computations_.insert(hlo->parent());
 }
 
 uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
@@ -43,17 +45,30 @@ uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
 }
 
 string HloExecutionProfile::ToString(
+    const HloComputation& computation,
     const DeviceDescription& device_description,
-    const HloCostAnalysis& cost_analysis) const {
+    const HloCostAnalysis::ShapeSizeFunction& shape_size) const {
+  HloCostAnalysis cost_analysis(shape_size);
+  tensorflow::Status analysis_status =
+      computation.root_instruction()->Accept(&cost_analysis);
+  if (!analysis_status.ok()) {
+    return "";
+  }
+
   using Item = std::pair<const HloInstruction*, uint64>;
-  std::vector<Item> items(hlo_to_cycles_taken_.begin(),
-                          hlo_to_cycles_taken_.end());
+  std::vector<Item> items;
+  for (Item item : hlo_to_cycles_taken_) {
+    // Only include the HLOs which are part of the desired computation.
+    if (item.first->parent() == &computation) {
+      items.push_back(item);
+    }
+  }
   auto custom_less = [](const Item& lhs, const Item& rhs) {
     return lhs.second > rhs.second;
   };
   std::sort(items.begin(), items.end(), custom_less);
   string result;
-  const int64 total_cycles = total_cycles_executed();
+  const int64 total_cycles = total_cycles_executed(computation);
   double clock_rate_ghz = device_description.clock_rate_ghz();
 
   const auto cycles_to_microseconds = [&](double cycles) {
@@ -88,11 +103,13 @@ string HloExecutionProfile::ToString(
             bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str()));
   };
   tensorflow::strings::StrAppend(
-      &result,
-      tensorflow::strings::Printf("HLO execution profile: (%s @ f_nom)\n\t",
-                                  tensorflow::strings::HumanReadableElapsedTime(
-                                      total_cycles / clock_rate_ghz / 1e9)
-                                      .c_str()));
+      &result, tensorflow::strings::Printf(
+                   "HLO execution profile for %s: (%s @ f_nom)\n\t",
+                   computation.name().c_str(),
+                   tensorflow::strings::HumanReadableElapsedTime(
+                       total_cycles / clock_rate_ghz / 1e9)
+                       .c_str()));
+
   append_item(total_cycles, -1, -1, "[total]");
   for (const auto& item : items) {
     const HloInstruction* hlo = item.first;
author	David Majnemer <majnemer@google.com>	2017-03-01 17:37:09 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-03-01 17:48:18 -0800
commit	af2c7253bb1f9d135ad9b0c6a271741205ab57fd (patch)
tree	5bc2c03e568e73b419531679dce6a86d61064a72 /tensorflow/compiler/xla/service/hlo_execution_profile.cc
parent	5ce1f684d2bb6e9220f68b9541d33342b5452918 (diff)