diff options
8 files changed, 95 insertions, 39 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index cd9ff4d858..53bf055b9d 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -230,7 +230,8 @@ Status CpuExecutable::ExecuteComputeFunction( } if (hlo_execution_profile != nullptr) { - hlo_execution_profile->set_total_cycles_executed(profile_counters.back()); + hlo_execution_profile->set_total_cycles_executed( + *module().entry_computation(), profile_counters.back()); for (auto hlo_prof_idx : hlo_to_profile_idx_) { const HloInstruction* hlo = hlo_prof_idx.first; diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc index d6d3d79f4a..0266442d7a 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc @@ -190,8 +190,8 @@ ParallelCpuExecutable::ExecuteOnStream( std::list<HloInstruction*> pending; // Call the function for each HLO instruction in topological order. - for (auto* instruction : - module().entry_computation()->MakeInstructionPostOrder()) { + const HloComputation& entry_computation = *module().entry_computation(); + for (auto* instruction : entry_computation.MakeInstructionPostOrder()) { // Parameters and constants have no functions associated with them. Instead // just copy the existing buffer into the map containing instruction // results.. 
@@ -299,7 +299,8 @@ ParallelCpuExecutable::ExecuteOnStream( execution_profile_.set_compute_cycle_count(profile_counters.back()); } if (hlo_execution_profile != nullptr) { - hlo_execution_profile->set_total_cycles_executed(profile_counters.back()); + hlo_execution_profile->set_total_cycles_executed(entry_computation, + profile_counters.back()); for (auto hlo_prof_idx : hlo_to_profile_idx_) { const HloInstruction* hlo = hlo_prof_idx.first; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index dc2b048878..ceb897ab9c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -47,8 +47,12 @@ class HloExecutionProfiler { public: // If profiling is enabled, start an execution timer running. explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile, - se::Stream* stream) - : do_profile_(do_profile), profile_(profile), stream_(stream) { + se::Stream* stream, + const HloComputation* computation) + : do_profile_(do_profile), + profile_(profile), + stream_(stream), + computation_(computation) { if (do_profile_) { clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz(); @@ -66,8 +70,8 @@ class HloExecutionProfiler { if (do_profile_) { stream_->ThenStopTimer(execution_timer_.get()); stream_->BlockHostUntilDone(); - profile_->set_total_cycles_executed(execution_timer_->Nanoseconds() * - clock_rate_ghz_); + profile_->set_total_cycles_executed( + *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_); } } @@ -94,6 +98,7 @@ class HloExecutionProfiler { double clock_rate_ghz_; HloExecutionProfile* profile_; se::Stream* stream_; + const HloComputation* computation_; std::unique_ptr<se::Timer> execution_timer_; std::unique_ptr<se::Timer> per_op_timer_; }; @@ -119,7 +124,8 @@ Status GpuExecutable::ExecuteThunks( if (do_profile) { LOG(WARNING) << "PROFILING: profiling is enabled"; } - 
HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream); + HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream, + hlo_module_->entry_computation()); std::vector<std::unique_ptr<se::Stream>> sub_streams; // Stream 0 indicates `main_stream` and substreams start from stream 1. diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 41be997584..8fe1897e75 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -369,7 +369,13 @@ Status HloCostAnalysis::HandleFusion(HloInstruction* fusion) { Status HloCostAnalysis::HandleCall( HloInstruction* call, tensorflow::gtl::ArraySlice<HloInstruction*> operands, HloComputation* computation) { - return Unimplemented("call"); + HloCostAnalysis computation_visitor(shape_size_); + TF_RETURN_IF_ERROR(computation->Accept(&computation_visitor)); + + current_flop_count_ = computation_visitor.flop_count(); + current_transcendental_count_ = computation_visitor.transcendental_count(); + current_bytes_accessed_ = computation_visitor.bytes_accessed(); + return Status::OK(); } Status HloCostAnalysis::HandleCustomCall( diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc index e2a81a052c..447892c8de 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/metric_table_report.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -32,6 +33,7 @@ namespace xla { void HloExecutionProfile::AddProfileResult(const HloInstruction* hlo, uint64 cycles_taken) { hlo_to_cycles_taken_[hlo] = cycles_taken; + profiled_computations_.insert(hlo->parent()); } uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const { @@ -43,17 +45,30 @@ uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const { } string HloExecutionProfile::ToString( + const HloComputation& computation, const DeviceDescription& device_description, - const HloCostAnalysis& cost_analysis) const { + const HloCostAnalysis::ShapeSizeFunction& shape_size) const { + HloCostAnalysis cost_analysis(shape_size); + tensorflow::Status analysis_status = + computation.root_instruction()->Accept(&cost_analysis); + if (!analysis_status.ok()) { + return ""; + } + using Item = std::pair<const HloInstruction*, uint64>; - std::vector<Item> items(hlo_to_cycles_taken_.begin(), - hlo_to_cycles_taken_.end()); + std::vector<Item> items; + for (Item item : hlo_to_cycles_taken_) { + // Only include the HLOs which are part of the desired computation. 
+ if (item.first->parent() == &computation) { + items.push_back(item); + } + } auto custom_less = [](const Item& lhs, const Item& rhs) { return lhs.second > rhs.second; }; std::sort(items.begin(), items.end(), custom_less); string result; - const int64 total_cycles = total_cycles_executed(); + const int64 total_cycles = total_cycles_executed(computation); double clock_rate_ghz = device_description.clock_rate_ghz(); const auto cycles_to_microseconds = [&](double cycles) { @@ -88,11 +103,13 @@ string HloExecutionProfile::ToString( bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str())); }; tensorflow::strings::StrAppend( - &result, - tensorflow::strings::Printf("HLO execution profile: (%s @ f_nom)\n\t", - tensorflow::strings::HumanReadableElapsedTime( - total_cycles / clock_rate_ghz / 1e9) - .c_str())); + &result, tensorflow::strings::Printf( + "HLO execution profile for %s: (%s @ f_nom)\n\t", + computation.name().c_str(), + tensorflow::strings::HumanReadableElapsedTime( + total_cycles / clock_rate_ghz / 1e9) + .c_str())); + append_item(total_cycles, -1, -1, "[total]"); for (const auto& item : items) { const HloInstruction* hlo = item.first; diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h index 6cc2079813..70b94a3f95 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.h +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h @@ -43,27 +43,45 @@ class HloExecutionProfile { uint64 GetProfileResult(const HloInstruction& hlo) const; // Return the number of cycles this computation took to execute. - uint64 total_cycles_executed() const { return total_cycles_executed_; } + uint64 total_cycles_executed(const HloComputation& computation) const { + auto it = total_cycles_executed_.find(&computation); + if (it != total_cycles_executed_.end()) { + return it->second; + } + return 0; + } - // Record how many cycles the entire computation took to execute. 
- void set_total_cycles_executed(uint64 total_cycles_executed) { - total_cycles_executed_ = total_cycles_executed; + // Record how many cycles a computation took to execute. + void set_total_cycles_executed(const HloComputation& computation, + uint64 total_cycles_executed) { + total_cycles_executed_[&computation] = total_cycles_executed; } // Returns a version of the execution profile suitable for performance // debugging; e.g. emits cycle counts, execution time at the nominal device // frequency, and the effective throughput given the provided cost_analysis - // for the operations. - string ToString(const DeviceDescription& device_description, - const HloCostAnalysis& cost_analysis) const; + // for the operations in a given computation. + // Returns an empty string if it wasn't possible to generate a printable + // version. + string ToString(const HloComputation& computation, + const DeviceDescription& device_description, + const HloCostAnalysis::ShapeSizeFunction& shape_size) const; + + // Returns the computations we have profiled. + std::unordered_set<const HloComputation*> profiled_computations() const { + return profiled_computations_; + } private: // Contains a mapping from HLO to the number of cycles it took to execute it. std::unordered_map<const HloInstruction*, uint64> hlo_to_cycles_taken_; - // If non-empty, contains the total number of cycles this computation took to + // If non-empty, contains the total number of cycles a computation took to // execute. - uint64 total_cycles_executed_ = 0; + std::unordered_map<const HloComputation*, uint64> total_cycles_executed_; + + // The computations we have profiled. 
+ std::unordered_set<const HloComputation*> profiled_computations_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index d7d8722ccc..4308d39860 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -329,7 +329,7 @@ string InstructionSequenceGraph( auto hlo_cycles_executed = hlo_execution_profile->GetProfileResult(*instruction); auto total_cycles_executed = - hlo_execution_profile->total_cycles_executed(); + hlo_execution_profile->total_cycles_executed(*instruction->parent()); if (hlo_cycles_executed > 0 && total_cycles_executed > 0) { Appendf(&label, "\\n%% of cycles executed=%.2f", (static_cast<double>(hlo_cycles_executed) / @@ -405,7 +405,7 @@ string ComputationToDotGraph(const HloComputation& computation, const HloExecutionProfile* hlo_execution_profile) { string graph_label = StrCat(label, "\\n", computation.name()); if (hlo_execution_profile != nullptr) { - auto cycles = hlo_execution_profile->total_cycles_executed(); + auto cycles = hlo_execution_profile->total_cycles_executed(computation); Appendf(&graph_label, "\\ntotal cycles = %lld (%s)", cycles, tensorflow::strings::HumanReadableNum(cycles).c_str()); } diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index ce07489fe0..1cad6866ad 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -450,16 +450,23 @@ ReturnT Service::ExecuteOnStreamWrapper( } if (profile_ptr != nullptr) { - HloCostAnalysis analysis([this](const Shape& shape) { + HloCostAnalysis::ShapeSizeFunction shape_size = [this](const Shape& shape) { return execute_backend_->compiler()->ShapeSizeBytes(shape); - }); - tensorflow::Status analysis_status = - executable->module().entry_computation()->root_instruction()->Accept( - &analysis); - if (analysis_status.ok()) { - 
XLA_LOG_LINES(tensorflow::INFO, - profile_ptr->ToString( - stream->parent()->GetDeviceDescription(), analysis)); + }; + std::unordered_set<const xla::HloComputation*> profiled_computations = + profile_ptr->profiled_computations(); + // To ensure we print the profiles in a stable order, iterate over the + // computations in post order. + std::list<xla::HloComputation*> all_computations = + executable->module().MakeComputationPostOrder(); + for (xla::HloComputation* computation : all_computations) { + if (profiled_computations.count(computation) > 0) { + string profile_string = profile_ptr->ToString( + *computation, stream->parent()->GetDeviceDescription(), shape_size); + if (!profile_string.empty()) { + XLA_LOG_LINES(tensorflow::INFO, profile_string); + } + } } DumpExecutedHlo(executable->module(), "Service::Execute", profile_ptr); } |