Split up HloExecutionProfile into a set of re-usable components

The end goal is to have Hlo profiling support in XlaJitCompiledCpuFunction and eventually AOT compiled XlaCompiledCpuFunction.

This change leaves the HloExecutionProfile interface mostly intact -- internally it uses the new split out components to do what it did before. However, in future CLs:

 - I'll extract out a HloExecutionProfilePrototype that contains the HloProfilePrinter, the OwningHloProfilePrinterStaticData and the HloToProfileIndex. This will then live in the Executable (if profiling is enabled).
 - The HloExecutionProfile for a specific execution will have a pointer to the parent HloExecutionProfilePrototype, which it will use to paginate profile_counters_.
 - The CPU backend will use the HloToProfileIndex in the HloExecutionProfilePrototype to map hlo instructions to profile counter offsets. This will make the indices in the generated code "line up" with the indices that the HloProfilePrinter expects.

These changes will allow the XlaJitCompiledCpuFunction (and later AOT) clients to pass in an appropriately sized zeroed buffer to the generated function and then pass that same buffer to the appropriate HloProfilePrinter to get a textual Hlo profile.

PiperOrigin-RevId: 175613737
author: Sanjoy Das <sanjoy@google.com> 2017-11-13 17:20:26 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-11-13 17:24:25 -0800
commit: 58c1aaf77721268a4ef87ebd2ab520a6d5a62f79 (patch)
tree: b2962dd6f8e8fa4ce217534f466b36c67b691bab /tensorflow/compiler/xla/service/hlo_execution_profile.h
parent: d44f37161d34f0de012e10d5aebc2acfdb292be2 (diff)
1 files changed, 66 insertions, 21 deletions
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index cdce77cff4..f945b9d84c 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -18,7 +18,9 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
@@ -27,6 +29,54 @@ namespace xla {
 
 class HloInstruction;
 
+// Maps all HloInstructions and HloComputations in an HloModule to integers.
+// These integers form the contiguous range [0, total_count()).
+class HloToProfileIndex {
+ public:
+  // Scans `module` to populate this instance of HloToProfileIndex.
+  explicit HloToProfileIndex(const HloModule& module);
+
+  HloToProfileIndex(const HloToProfileIndex&) = default;
+  HloToProfileIndex(HloToProfileIndex&&) = default;
+
+  HloToProfileIndex& operator=(const HloToProfileIndex&) = default;
+  HloToProfileIndex& operator=(HloToProfileIndex&&) = default;
+
+  size_t GetProfileIndexFor(const HloInstruction& instruction) const {
+    return FindOrDie(instruction_to_profile_idx(), &instruction);
+  }
+
+  size_t GetProfileIndexFor(const HloComputation& computation) const {
+    return FindOrDie(computation_to_profile_idx(), &computation);
+  }
+
+  size_t instruction_count() const {
+    return instruction_to_profile_idx().size();
+  }
+
+  size_t computation_count() const {
+    return computation_to_profile_idx().size();
+  }
+
+  size_t total_count() const {
+    return instruction_count() + computation_count();
+  }
+
+  const std::unordered_map<const HloInstruction*, int64>&
+  instruction_to_profile_idx() const {
+    return instruction_to_profile_idx_;
+  }
+
+  const std::unordered_map<const HloComputation*, int64>&
+  computation_to_profile_idx() const {
+    return computation_to_profile_idx_;
+  }
+
+ private:
+  std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx_;
+  std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
+};
+
 // Describes how much time each HLO operation took.
 //
 // Each HloComputation takes a certain number of cycles.  This class helps break
@@ -35,6 +85,9 @@ class HloExecutionProfile {
  public:
   using DeviceDescription = perftools::gputools::DeviceDescription;
 
+  HloExecutionProfile(const HloModule& module,
+                      const HloCostAnalysis& cost_analysis);
+
   // Record how many cycles this HLO took to execute.
   void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken);
 
@@ -44,17 +97,15 @@ class HloExecutionProfile {
 
   // Return the number of cycles this computation took to execute.
   uint64 total_cycles_executed(const HloComputation& computation) const {
-    auto it = total_cycles_executed_.find(&computation);
-    if (it != total_cycles_executed_.end()) {
-      return it->second;
-    }
-    return 0;
+    return profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(
+        computation)];
   }
 
   // Record how many cycles a computation took to execute.
   void set_total_cycles_executed(const HloComputation& computation,
                                  uint64 total_cycles_executed) {
-    total_cycles_executed_[&computation] = total_cycles_executed;
+    profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(computation)] =
+        total_cycles_executed;
   }
 
   // Returns a version of the execution profile suitable for performance
@@ -63,25 +114,19 @@ class HloExecutionProfile {
   // for the operations in a given computation. Returns an empty string if it
   // wasn't possible to generate a printable version. cost_analysis should be a
   // clean analysis that can be used to visit the computation.
-  string ToString(const HloComputation& computation,
-                  const DeviceDescription& device_description,
-                  HloCostAnalysis* cost_analysis) const;
-
-  // Returns the computations we have profiled.
-  std::unordered_set<const HloComputation*> profiled_computations() const {
-    return profiled_computations_;
-  }
+  string ToString(const DeviceDescription& device_description) const;
 
  private:
-  // Contains a mapping from HLO to the number of cycles it took to execute it.
-  std::unordered_map<const HloInstruction*, uint64> hlo_to_cycles_taken_;
+  // hlo_to_profile_index_ maps an Hlo entity (computation or instruction) to an
+  // index in profile_counters_.
+  HloToProfileIndex hlo_to_profile_index_;
 
-  // If non-empty, contains the total number of cycles a computation took to
-  // execute.
-  std::unordered_map<const HloComputation*, uint64> total_cycles_executed_;
+  // Used to print profile_counters_ in a human readable form.
+  HloProfilePrinter hlo_profile_printer_;
 
-  // The computations we have profiled.
-  std::unordered_set<const HloComputation*> profiled_computations_;
+  // Stores per-Hlo profile counters.  This is the only thing that changes when
+  // we execute an XLA computation.
+  std::vector<int64> profile_counters_;
 };
 
 }  // namespace xla
author	Sanjoy Das <sanjoy@google.com>	2017-11-13 17:20:26 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-11-13 17:24:25 -0800
commit	58c1aaf77721268a4ef87ebd2ab520a6d5a62f79 (patch)
tree	b2962dd6f8e8fa4ce217534f466b36c67b691bab /tensorflow/compiler/xla/service/hlo_execution_profile.h
parent	d44f37161d34f0de012e10d5aebc2acfdb292be2 (diff)