diff options
Diffstat (limited to 'tensorflow/compiler/xla/service/hlo_execution_profile.h')
-rw-r--r-- | tensorflow/compiler/xla/service/hlo_execution_profile.h | 87 |
1 files changed, 66 insertions, 21 deletions
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h index cdce77cff4..f945b9d84c 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.h +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h @@ -18,7 +18,9 @@ limitations under the License. #include <unordered_map> +#include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_profile_printer.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" @@ -27,6 +29,54 @@ namespace xla { class HloInstruction; +// Maps all HloInstructions and HloComputations in an HloModule to integers. +// These integers form the contiguous range [0, total_count()). +class HloToProfileIndex { + public: + // Scans `module` to populate this instance of HloToProfileIndex. + explicit HloToProfileIndex(const HloModule& module); + + HloToProfileIndex(const HloToProfileIndex&) = default; + HloToProfileIndex(HloToProfileIndex&&) = default; + + HloToProfileIndex& operator=(const HloToProfileIndex&) = default; + HloToProfileIndex& operator=(HloToProfileIndex&&) = default; + + size_t GetProfileIndexFor(const HloInstruction& instruction) const { + return FindOrDie(instruction_to_profile_idx(), &instruction); + } + + size_t GetProfileIndexFor(const HloComputation& computation) const { + return FindOrDie(computation_to_profile_idx(), &computation); + } + + size_t instruction_count() const { + return instruction_to_profile_idx().size(); + } + + size_t computation_count() const { + return computation_to_profile_idx().size(); + } + + size_t total_count() const { + return instruction_count() + computation_count(); + } + + const std::unordered_map<const HloInstruction*, int64>& + instruction_to_profile_idx() const { + return instruction_to_profile_idx_; + } + + const 
std::unordered_map<const HloComputation*, int64>& + computation_to_profile_idx() const { + return computation_to_profile_idx_; + } + + private: + std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx_; + std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_; +}; + // Describes how much time each HLO operation took. // // Each HloComputation takes a certain number of cycles. This class helps break @@ -35,6 +85,9 @@ class HloExecutionProfile { public: using DeviceDescription = perftools::gputools::DeviceDescription; + HloExecutionProfile(const HloModule& module, + const HloCostAnalysis& cost_analysis); + // Record how many cycles this HLO took to execute. void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken); @@ -44,17 +97,15 @@ class HloExecutionProfile { // Return the number of cycles this computation took to execute. uint64 total_cycles_executed(const HloComputation& computation) const { - auto it = total_cycles_executed_.find(&computation); - if (it != total_cycles_executed_.end()) { - return it->second; - } - return 0; + return profile_counters_[hlo_to_profile_index_.GetProfileIndexFor( + computation)]; } // Record how many cycles a computation took to execute. void set_total_cycles_executed(const HloComputation& computation, uint64 total_cycles_executed) { - total_cycles_executed_[&computation] = total_cycles_executed; + profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(computation)] = + total_cycles_executed; } // Returns a version of the execution profile suitable for performance @@ -63,25 +114,19 @@ class HloExecutionProfile { // for the operations in a given computation. Returns an empty string if it // wasn't possible to generate a printable version. cost_analysis should be a // clean analysis that can be used to visit the computation. 
- string ToString(const HloComputation& computation, - const DeviceDescription& device_description, - HloCostAnalysis* cost_analysis) const; - - // Returns the computations we have profiled. - std::unordered_set<const HloComputation*> profiled_computations() const { - return profiled_computations_; - } + string ToString(const DeviceDescription& device_description) const; private: - // Contains a mapping from HLO to the number of cycles it took to execute it. - std::unordered_map<const HloInstruction*, uint64> hlo_to_cycles_taken_; + // hlo_to_profile_index_ maps an Hlo entity (computation or instruction) to an + // index in profile_counters_. + HloToProfileIndex hlo_to_profile_index_; - // If non-empty, contains the total number of cycles a computation took to - // execute. - std::unordered_map<const HloComputation*, uint64> total_cycles_executed_; + // Used to print profile_counters_ in a human readable form. + HloProfilePrinter hlo_profile_printer_; - // The computations we have profiled. - std::unordered_set<const HloComputation*> profiled_computations_; + // Stores per-Hlo profile counters. This is the only thing that changes when + // we execute an XLA computation. + std::vector<int64> profile_counters_; }; } // namespace xla |