aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/compiler/xla/service/hlo_execution_profile.h
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/compiler/xla/service/hlo_execution_profile.h')
-rw-r--r--tensorflow/compiler/xla/service/hlo_execution_profile.h87
1 files changed, 66 insertions, 21 deletions
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index cdce77cff4..f945b9d84c 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -18,7 +18,9 @@ limitations under the License.
#include <unordered_map>
+#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
@@ -27,6 +29,54 @@ namespace xla {
class HloInstruction;
+// Maps all HloInstructions and HloComputations in an HloModule to integers.
+// These integers form the contiguous range [0, total_count()).
+class HloToProfileIndex {
+ public:
+ // Scans `module` to populate this instance of HloToProfileIndex.
+ explicit HloToProfileIndex(const HloModule& module);
+
+ HloToProfileIndex(const HloToProfileIndex&) = default;
+ HloToProfileIndex(HloToProfileIndex&&) = default;
+
+ HloToProfileIndex& operator=(const HloToProfileIndex&) = default;
+ HloToProfileIndex& operator=(HloToProfileIndex&&) = default;
+
+ size_t GetProfileIndexFor(const HloInstruction& instruction) const {
+ return FindOrDie(instruction_to_profile_idx(), &instruction);
+ }
+
+ size_t GetProfileIndexFor(const HloComputation& computation) const {
+ return FindOrDie(computation_to_profile_idx(), &computation);
+ }
+
+ size_t instruction_count() const {
+ return instruction_to_profile_idx().size();
+ }
+
+ size_t computation_count() const {
+ return computation_to_profile_idx().size();
+ }
+
+ size_t total_count() const {
+ return instruction_count() + computation_count();
+ }
+
+ const std::unordered_map<const HloInstruction*, int64>&
+ instruction_to_profile_idx() const {
+ return instruction_to_profile_idx_;
+ }
+
+ const std::unordered_map<const HloComputation*, int64>&
+ computation_to_profile_idx() const {
+ return computation_to_profile_idx_;
+ }
+
+ private:
+ std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx_;
+ std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
+};
+
// Describes how much time each HLO operation took.
//
// Each HloComputation takes a certain number of cycles. This class helps break
@@ -35,6 +85,9 @@ class HloExecutionProfile {
public:
using DeviceDescription = perftools::gputools::DeviceDescription;
+ HloExecutionProfile(const HloModule& module,
+ const HloCostAnalysis& cost_analysis);
+
// Record how many cycles this HLO took to execute.
void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken);
@@ -44,17 +97,15 @@ class HloExecutionProfile {
// Return the number of cycles this computation took to execute.
uint64 total_cycles_executed(const HloComputation& computation) const {
- auto it = total_cycles_executed_.find(&computation);
- if (it != total_cycles_executed_.end()) {
- return it->second;
- }
- return 0;
+ return profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(
+ computation)];
}
// Record how many cycles a computation took to execute.
void set_total_cycles_executed(const HloComputation& computation,
uint64 total_cycles_executed) {
- total_cycles_executed_[&computation] = total_cycles_executed;
+ profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(computation)] =
+ total_cycles_executed;
}
// Returns a version of the execution profile suitable for performance
@@ -63,25 +114,19 @@ class HloExecutionProfile {
// for the operations in a given computation. Returns an empty string if it
// wasn't possible to generate a printable version. cost_analysis should be a
// clean analysis that can be used to visit the computation.
- string ToString(const HloComputation& computation,
- const DeviceDescription& device_description,
- HloCostAnalysis* cost_analysis) const;
-
- // Returns the computations we have profiled.
- std::unordered_set<const HloComputation*> profiled_computations() const {
- return profiled_computations_;
- }
+ string ToString(const DeviceDescription& device_description) const;
private:
- // Contains a mapping from HLO to the number of cycles it took to execute it.
- std::unordered_map<const HloInstruction*, uint64> hlo_to_cycles_taken_;
+ // hlo_to_profile_index_ maps an Hlo entity (computation or instruction) to an
+ // index in profile_counters_.
+ HloToProfileIndex hlo_to_profile_index_;
- // If non-empty, contains the total number of cycles a computation took to
- // execute.
- std::unordered_map<const HloComputation*, uint64> total_cycles_executed_;
+ // Used to print profile_counters_ in a human readable form.
+ HloProfilePrinter hlo_profile_printer_;
- // The computations we have profiled.
- std::unordered_set<const HloComputation*> profiled_computations_;
+ // Stores per-Hlo profile counters. This is the only thing that changes when
+ // we execute an XLA computation.
+ std::vector<int64> profile_counters_;
};
} // namespace xla