diff options
15 files changed, 193 insertions, 155 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 56940b8d63..ff6042ae19 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -197,28 +197,35 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) { class CollectProfileCandidates : public DfsHloVisitorWithDefault { public: static StatusOr<std::unordered_map<const HloInstruction*, size_t>> - GetCandidatesForComputation(HloComputation* computation) { + GetCandidatesForComputation( + HloComputation* computation, + const std::unordered_map<const HloInstruction*, int64>& + assigned_indices) { std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx; CollectProfileCandidates profile_candidates_for_computation( - &hlo_to_profile_idx); + &hlo_to_profile_idx, assigned_indices); TF_RETURN_IF_ERROR( computation->Accept(&profile_candidates_for_computation)); return hlo_to_profile_idx; } private: - explicit CollectProfileCandidates( - std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx) - : hlo_to_profile_idx_(hlo_to_profile_idx) {} + CollectProfileCandidates( + std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx, + const std::unordered_map<const HloInstruction*, int64>& assigned_indices) + : hlo_to_profile_idx_(hlo_to_profile_idx), + assigned_indices_(assigned_indices) {} Status DefaultAction(HloInstruction* hlo_instruction) override { - hlo_to_profile_idx_->insert({hlo_instruction, hlo_to_profile_idx_->size()}); + hlo_to_profile_idx_->insert( + {hlo_instruction, FindOrDie(assigned_indices_, hlo_instruction)}); return Status::OK(); } Status HandleCall(HloInstruction* call) override { TF_RETURN_IF_ERROR(DefaultAction(call)); - CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_); + CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_, + assigned_indices_); 
TF_RETURN_IF_ERROR(call->to_apply()->Accept(&candidates_for_call)); return Status::OK(); } @@ -232,17 +239,20 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault { Status HandleWhile(HloInstruction* xla_while) override { TF_RETURN_IF_ERROR(DefaultAction(xla_while)); - CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_); + CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_, + assigned_indices_); TF_RETURN_IF_ERROR( xla_while->while_condition()->Accept(&candidates_for_condition)); - CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_); + CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_, + assigned_indices_); TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&candidates_for_body)); return Status::OK(); } std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx_; + const std::unordered_map<const HloInstruction*, int64>& assigned_indices_; }; } // namespace @@ -475,10 +485,27 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend( HloComputation* computation = module->entry_computation(); std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx; + std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map; + std::unique_ptr<HloProfilePrinter> hlo_profile_printer; if (module->config().hlo_profiling_enabled()) { + hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(*module); + TF_ASSIGN_OR_RETURN( hlo_to_profile_idx, - CollectProfileCandidates::GetCandidatesForComputation(computation)); + CollectProfileCandidates::GetCandidatesForComputation( + computation, hlo_profile_index_map->instruction_to_profile_idx())); + + auto shape_size_bytes = [](const Shape& shape) { + // On the cpu, opaques are pointers. 
+ if (ShapeUtil::IsOpaque(shape)) { + return static_cast<int64>(sizeof(void*)); + } + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + }; + + HloCostAnalysis cost_analysis(shape_size_bytes); + hlo_profile_printer = + CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis); } std::unique_ptr<Executable> cpu_executable; @@ -544,8 +571,16 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend( parallel_computations.emplace(to_apply, instruction); } + // We always profile the entire computation as a whole, even if hlo + // profiling is disabled. When hlo profiling is disabled, we pass in a + // profile counter array of just one element, which corresponds to the whole + // computation. + size_t entry_computation_profile_idx = + hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor( + *module->entry_computation()) + : 0; IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), - hlo_to_profile_idx, hlo_to_profile_idx.size(), + hlo_to_profile_idx, entry_computation_profile_idx, jit->target_machine(), jit->external_constant_pool()); std::unique_ptr<HloInstructionMap<string>> function_names( @@ -586,8 +621,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend( jit->AddModule(std::move(llvm_module)); cpu_executable.reset(new ParallelCpuExecutable( std::move(jit), std::move(assignment), std::move(module), - std::move(function_names), std::move(hlo_to_profile_idx), - std::move(aligned_constants))); + std::move(function_names), std::move(aligned_constants), + std::move(hlo_profile_printer), std::move(hlo_profile_index_map))); if (embed_ir_in_executable) { static_cast<CpuExecutable&>(*cpu_executable) @@ -620,12 +655,22 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend( TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory( proto, xla_dump_hlo_proto_to, module->name())); } + // We always profile the entire computation as a whole, even if hlo + // profiling is disabled. 
When hlo profiling is disabled, we pass in a + // profile counter array of just one element, which corresponds to the whole + // computation. + size_t entry_computation_profile_idx = + hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor( + *module->entry_computation()) + : 0; + + // Each computation is a single function. Emit all embedded computations // before the entry computation. The order of computations returned from // GetEmbeddedComputations guarantees that a called computation occurs // before a caller computation. + IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), - hlo_to_profile_idx, hlo_to_profile_idx.size(), + hlo_to_profile_idx, entry_computation_profile_idx, jit->target_machine(), jit->external_constant_pool()); for (auto embedded_computation : @@ -659,7 +704,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend( jit->AddModule(std::move(llvm_module)); cpu_executable.reset(new CpuExecutable( std::move(jit), std::move(assignment), std::move(module), function_name, - std::move(hlo_to_profile_idx))); + std::move(hlo_profile_printer), std::move(hlo_profile_index_map))); if (embed_ir_in_executable) { static_cast<CpuExecutable&>(*cpu_executable) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index e6ef9d6314..e956f478b8 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -55,11 +55,12 @@ CpuExecutable::CpuExecutable( std::unique_ptr<const BufferAssignment> assignment, std::unique_ptr<const HloModule> hlo_module, const string& entry_function_name, - std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx) - : Executable(std::move(hlo_module)), + std::unique_ptr<HloProfilePrinter> hlo_profile_printer, + std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map) + : Executable(std::move(hlo_module), std::move(hlo_profile_printer), + 
std::move(hlo_profile_index_map)), jit_(std::move(jit)), - assignment_(std::move(assignment)), - hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) { + assignment_(std::move(assignment)) { // Resolve symbols in the constructor rather than at execution time to avoid // races because FindSymbol is not thread safe. llvm::JITSymbol sym = jit_->FindSymbol(entry_function_name); @@ -183,9 +184,16 @@ Status CpuExecutable::ExecuteComputeFunction( uint64 start_micros = tensorflow::Env::Default()->NowMicros(); // Allocate profiling counters for each hlo instruction that we would like to - // profile. Allocate an additional profile counter for the entire - // computation. - std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1); + // profile. Even when HLO profiling is disabled, we allocate a counter for + // the entire computation, which we use to update ExecutionProfile below. + std::vector<int64>* profile_counters = nullptr; + std::vector<int64> profile_counter_for_entry_computation; + if (hlo_execution_profile) { + profile_counters = hlo_execution_profile->mutable_profile_counters(); + } else { + profile_counters = &profile_counter_for_entry_computation; + profile_counter_for_entry_computation.push_back(0); + } // Call the computation function following the calling convention. 
std::vector<void*> buffer_pointers; @@ -200,7 +208,7 @@ Status CpuExecutable::ExecuteComputeFunction( VLOG(3) << tensorflow::strings::Printf( " func(void* result, void* params[%zu], void* temps[%zu], " "uint64 profile_counters[%zu])", - args_array.size(), buffer_pointers.size(), profile_counters.size()); + args_array.size(), buffer_pointers.size(), profile_counters->size()); VLOG(3) << tensorflow::strings::Printf(" result = %p", result_buffer); auto ptr_printer = [](string* out, const void* p) { tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p)); @@ -212,11 +220,11 @@ Status CpuExecutable::ExecuteComputeFunction( " temps = [%s]", tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str()); VLOG(3) << tensorflow::strings::Printf(" profile_counters = %p", - profile_counters.data()); + profile_counters->data()); } compute_function_(result_buffer, run_options, args_array.data(), - buffer_pointers.data(), profile_counters.data()); + buffer_pointers.data(), profile_counters->data()); uint64 end_micros = tensorflow::Env::Default()->NowMicros(); @@ -225,20 +233,15 @@ Status CpuExecutable::ExecuteComputeFunction( const double nanoseconds = (end_micros - start_micros) * 1000.0; execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); - // The last profile counter is used for the computation as a whole. 
- execution_profile_.set_compute_cycle_count(profile_counters.back()); - } - - if (hlo_execution_profile != nullptr) { - hlo_execution_profile->set_total_cycles_executed( - *module().entry_computation(), profile_counters.back()); - - for (auto hlo_prof_idx : hlo_to_profile_idx_) { - const HloInstruction* hlo = hlo_prof_idx.first; - uint64 cycles_taken = profile_counters[hlo_prof_idx.second]; - hlo_execution_profile->SetCyclesTakenBy(hlo, cycles_taken); + if (hlo_execution_profile) { + execution_profile_.set_compute_cycle_count( + hlo_execution_profile->total_cycles_executed( + *module().entry_computation())); + } else { + execution_profile_.set_compute_cycle_count(profile_counters->back()); } } + return Status::OK(); } @@ -428,9 +431,5 @@ const PointsToSet& CpuExecutable::GetRootPointsToSet() const { module().entry_computation()->root_instruction()); } -std::unique_ptr<HloCostAnalysis> CpuExecutable::CreateCostAnalysis() const { - return MakeUnique<HloCostAnalysis>(ShapeSizeBytes); -} - } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 238bc9b46a..17ee2d673e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -47,12 +47,12 @@ namespace cpu { // architecture, so JIT-ed code and host code share the same ABI. 
class CpuExecutable : public Executable { public: - CpuExecutable( - std::unique_ptr<SimpleOrcJIT> jit, - std::unique_ptr<const BufferAssignment> assignment, - std::unique_ptr<const HloModule> hlo_module, - const string& entry_function_name, - std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx); + CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit, + std::unique_ptr<const BufferAssignment> assignment, + std::unique_ptr<const HloModule> hlo_module, + const string& entry_function_name, + std::unique_ptr<HloProfilePrinter> hlo_profile_printer, + std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map); ~CpuExecutable() override {} StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream( @@ -85,12 +85,10 @@ class CpuExecutable : public Executable { static int64 ShapeSizeBytes(const Shape& shape); - std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override; - // Type of the computation function we expect in the JIT. using ComputeFunctionType = void (*)( void* /*result*/, const ExecutableRunOptions* /*run_options*/, - const void** /*args*/, void** /*temps*/, uint64* /*profile_counters*/); + const void** /*args*/, void** /*temps*/, int64* /*profile_counters*/); const ComputeFunctionType& compute_function() const { return compute_function_; @@ -145,9 +143,6 @@ class CpuExecutable : public Executable { // Entry function name for the computation. const string entry_function_name_; - // Maps HLOs to their index into the profile counter array. 
- const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_; - TF_DISALLOW_COPY_AND_ASSIGN(CpuExecutable); }; diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc index aff61296ce..0077e344e2 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc @@ -59,19 +59,20 @@ ParallelCpuExecutable::ParallelCpuExecutable( std::unique_ptr<const BufferAssignment> assignment, std::unique_ptr<const HloModule> hlo_module, std::unique_ptr<const HloInstructionMap<string>> function_names, - std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx, std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>> - aligned_constants) - : Executable(std::move(hlo_module)), + aligned_constants, + std::unique_ptr<HloProfilePrinter> hlo_profile_printer, + std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map) + : Executable(std::move(hlo_module), std::move(hlo_profile_printer), + std::move(hlo_profile_index_map)), jit_(std::move(jit)), assignment_(std::move(assignment)), function_names_(std::move(function_names)), - hlo_to_profile_idx_(std::move(hlo_to_profile_idx)), aligned_constants_(std::move(aligned_constants)) {} // Type of the computation function we expect in the JIT. using ComputeFunctionType = void (*)(void*, const void*, const void**, void**, - int64*, uint64*); + int64*, int64*); // Given a pointer to an output buffer (following the CPU JIT calling // conventions), mark addresses that are "live". 
The initial pointer itself is @@ -106,7 +107,7 @@ class Executor { const ServiceExecutableRunOptions* run_options, std::list<HloInstruction*>* pending, HloInstructionMap<const void*>* results, void** temps_array, - uint64* profile_counters_array, const BufferAssignment* assignment) + int64* profile_counters_array, const BufferAssignment* assignment) : functions_(functions), run_options_(run_options), pending_(pending), @@ -147,7 +148,7 @@ class Executor { std::list<HloInstruction*>* pending_; HloInstructionMap<const void*>* results_; void** temps_array_; - uint64* profile_counters_array_; + int64* profile_counters_array_; tensorflow::thread::ThreadPool* thread_pool_; const BufferAssignment* assignment_; @@ -389,9 +390,11 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions( tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers, HloExecutionProfile* hlo_execution_profile) { // Allocate profiling counters for each hlo instruction that we would like to - // profile. Allocate an additional profile counter for the entire - // computation. - std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1); + // profile. + std::vector<int64>* profile_counters = nullptr; + if (hlo_execution_profile) { + profile_counters = hlo_execution_profile->mutable_profile_counters(); + } std::vector<void*> buffer_pointers; buffer_pointers.reserve(buffers.size()); @@ -441,9 +444,9 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions( // For example, if we expect a library conv/matmul call to run at max // concurrency, we should not dispatch runnable instructions until the // library call is finished (to avoid expensive cache invalidation). - Executor executor(functions, run_options, &pending, &results, - buffer_pointers.data(), profile_counters.data(), - assignment_.get()); + Executor executor( + functions, run_options, &pending, &results, buffer_pointers.data(), + profile_counters ? 
profile_counters->data() : nullptr, assignment_.get()); TF_RETURN_IF_ERROR(executor.Run()); @@ -453,18 +456,6 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions( tensorflow::mutex_lock lock(mutex_); double nanoseconds = (end_micros - start_micros) * 1000.0; execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); - // The last profile counter is used for the computation as a whole. - execution_profile_.set_compute_cycle_count(profile_counters.back()); - } - if (hlo_execution_profile != nullptr) { - hlo_execution_profile->set_total_cycles_executed(entry_computation, - profile_counters.back()); - - for (auto hlo_prof_idx : hlo_to_profile_idx_) { - const HloInstruction* hlo = hlo_prof_idx.first; - uint64 cycles_taken = profile_counters[hlo_prof_idx.second]; - hlo_execution_profile->SetCyclesTakenBy(hlo, cycles_taken); - } } return Status::OK(); @@ -618,10 +609,5 @@ const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const { module().entry_computation()->root_instruction()); } -std::unique_ptr<HloCostAnalysis> ParallelCpuExecutable::CreateCostAnalysis() - const { - return MakeUnique<HloCostAnalysis>(ShapeSizeBytes); -} - } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h index db16aaf48b..d65e3f42f3 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h @@ -52,10 +52,11 @@ class ParallelCpuExecutable : public Executable { std::unique_ptr<const BufferAssignment> assignment, std::unique_ptr<const HloModule> hlo_module, std::unique_ptr<const HloInstructionMap<string>> function_names, - std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx, std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>> - aligned_constants); + aligned_constants, + std::unique_ptr<HloProfilePrinter> hlo_profile_printer, + 
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map); ~ParallelCpuExecutable() override {} StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream( @@ -95,8 +96,6 @@ class ParallelCpuExecutable : public Executable { "Equality test on CPU parallel executable is not implemented."); } - std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override; - private: // Allocate buffers required for execution and assign them to the elements of // "buffers". "buffers" should be sized to the number of buffers in buffer @@ -143,9 +142,6 @@ class ParallelCpuExecutable : public Executable { // Map containing the JITted function names for each HLO instruction. const std::unique_ptr<const HloInstructionMap<string>> function_names_; - // Maps HLOs to their index into the profile counter array. - const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_; - // Map from HLO Constant instructions to a pointer to their literal data. // The data stored in the protocol buffer might be insufficiently aligned, // we create a sufficiently aligned copy and store it in this map. diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 2135707371..08862308c9 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -44,8 +44,15 @@ namespace xla { // interface that is used for launching compiled programs across platforms. 
class Executable { public: - explicit Executable(std::unique_ptr<const HloModule> hlo_module) - : hlo_module_(std::move(hlo_module)) {} + explicit Executable(std::unique_ptr<const HloModule> hlo_module, + std::unique_ptr<HloProfilePrinter> hlo_profile_printer, + std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map) + : hlo_module_(std::move(hlo_module)), + hlo_profile_printer_(std::move(hlo_profile_printer)), + hlo_profile_index_map_(std::move(hlo_profile_index_map)) { + CHECK_EQ(hlo_profile_printer_.get() == nullptr, + hlo_profile_index_map_.get() == nullptr); + } virtual ~Executable() {} // Enqueues the compilation result on the provided stream, passing the given @@ -123,12 +130,20 @@ class Executable { "Equality test on this executable is not implemented."); } + const HloProfilePrinter& hlo_profile_printer() const { + CHECK(hlo_profiling_enabled()); + return *hlo_profile_printer_; + } + + const HloProfileIndexMap& hlo_profile_index_map() const { + CHECK(hlo_profiling_enabled()); + return *hlo_profile_index_map_; + } + // Returns whether this executable was compiled with HLO profilings support // enabled. If not, the caller should not expect an hlo_execution_profile // passed to ExecuteOnStream above to be populated during execution. - bool hlo_profiling_enabled() const { - return hlo_module_->config().hlo_profiling_enabled(); - } + bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; } const HloModule& module() const { return *hlo_module_; } @@ -160,10 +175,6 @@ class Executable { static Status DumpToDirectory(const string& directory_path, string filename, const SessionModule& session_module); - // Returns a cost analysis object appropriate for the platform on which this - // executable can run. 
- virtual std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const = 0; - protected: mutable tensorflow::mutex mutex_; @@ -181,6 +192,9 @@ class Executable { // Execution count, used to generate a unique filename for each dumped // execution. int64 execution_count_ = 0; + + std::unique_ptr<HloProfilePrinter> hlo_profile_printer_; + std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_; }; template <typename ReturnT, typename ArgT> @@ -200,7 +214,8 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper( std::unique_ptr<HloExecutionProfile> profile_ptr = module_config().debug_options().xla_hlo_profile() && hlo_profiling_enabled() - ? MakeUnique<HloExecutionProfile>(module(), *CreateCostAnalysis()) + ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer(), + &hlo_profile_index_map()) : nullptr; auto return_value = diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 92c53265d0..fcd73fd37a 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -465,10 +465,20 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend( VLOG(2) << "Printing the thunk schedule..."; XLA_VLOG_LINES(2, thunk_schedule->ToString()); - auto* gpu_executable = - new GpuExecutable(ptx, cubin, {cc_major, cc_minor}, - std::move(thunk_schedule), std::move(module), - std::move(buffer_assignment), ShapeSizeBytesFunction()); + std::unique_ptr<HloProfileIndexMap> profile_index_map; + std::unique_ptr<HloProfilePrinter> profile_printer; + + if (module->config().hlo_profiling_enabled()) { + HloCostAnalysis cost_analysis(ShapeSizeBytesFunction()); + profile_index_map = MakeUnique<HloProfileIndexMap>(*module); + profile_printer = + CreateHloProfilePrinter(*profile_index_map, cost_analysis); + } + + auto* gpu_executable = new GpuExecutable( + ptx, cubin, {cc_major, cc_minor}, std::move(thunk_schedule), + std::move(module), std::move(buffer_assignment), + 
std::move(profile_printer), std::move(profile_index_map)); if (embed_ir_in_executable) { DCHECK_NE("", ir_module_string_before_opt); gpu_executable->set_ir_module_string(ir_module_string_before_opt); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index c6f23f9b05..0fd85e4fb0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -113,14 +113,15 @@ GpuExecutable::GpuExecutable( std::unique_ptr<const ThunkSchedule> thunk_schedule, std::unique_ptr<const HloModule> hlo_module, std::unique_ptr<const BufferAssignment> assignment, - HloCostAnalysis::ShapeSizeFunction shape_size_function) - : Executable(std::move(hlo_module)), + std::unique_ptr<HloProfilePrinter> hlo_profile_printer, + std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map) + : Executable(std::move(hlo_module), std::move(hlo_profile_printer), + std::move(hlo_profile_index_map)), ptx_(ptx), cubin_(cubin), compute_capability_(compute_capability), thunk_schedule_(std::move(thunk_schedule)), - assignment_(std::move(assignment)), - shape_size_function_(std::move(shape_size_function)) {} + assignment_(std::move(assignment)) {} Status GpuExecutable::ExecuteThunks( const ServiceExecutableRunOptions* run_options, @@ -358,9 +359,5 @@ const PointsToSet& GpuExecutable::GetRootPointsToSet() const { module().entry_computation()->root_instruction()); } -std::unique_ptr<HloCostAnalysis> GpuExecutable::CreateCostAnalysis() const { - return MakeUnique<HloCostAnalysis>(shape_size_function_); -} - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index a3815370c1..e7307e07c0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -54,7 +54,8 @@ class GpuExecutable : public Executable { 
std::unique_ptr<const ThunkSchedule> thunk_schedule, std::unique_ptr<const HloModule> hlo_module, std::unique_ptr<const BufferAssignment> assignment, - HloCostAnalysis::ShapeSizeFunction shape_size_function); + std::unique_ptr<HloProfilePrinter> hlo_profile_printer, + std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map); // This should be called after set_ir_module_string. const string& ir_module_string() const { return ir_module_string_; } @@ -95,8 +96,6 @@ class GpuExecutable : public Executable { return Unimplemented("Equality test on GPU executable is not implemented."); } - std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override; - private: // If `block_host_until_done` is false, execution will not block the host // until the kernels have completed. This is used as an optimization for @@ -140,9 +139,6 @@ class GpuExecutable : public Executable { // memory for every output/temp buffers. const std::unique_ptr<const BufferAssignment> assignment_; - // Function to compute the size of a given Shape, in bytes. 
- const HloCostAnalysis::ShapeSizeFunction shape_size_function_; - TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable); }; diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc index 9e256b9b37..ba75e2ef1b 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc @@ -40,7 +40,7 @@ HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) { } } -static HloProfilePrinter CreateOwnedHloProfilePrinter( +std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter( const HloProfileIndexMap& hlo_profile_index_map, const HloCostAnalysis& cost_analysis) { using HloComputationInfo = HloProfilePrinter::HloComputationInfo; @@ -108,15 +108,15 @@ static HloProfilePrinter CreateOwnedHloProfilePrinter( delete[] computation_infos; }; - return HloProfilePrinter(computation_infos, - hlo_profile_index_map.computation_count(), deleter); + return MakeUnique<HloProfilePrinter>( + computation_infos, hlo_profile_index_map.computation_count(), deleter); } -HloExecutionProfile::HloExecutionProfile(const HloModule& module, - const HloCostAnalysis& cost_analysis) - : hlo_profile_index_map_(module), - hlo_profile_printer_( - CreateOwnedHloProfilePrinter(hlo_profile_index_map_, cost_analysis)), +HloExecutionProfile::HloExecutionProfile( + const HloProfilePrinter* hlo_profile_printer, + const HloProfileIndexMap* hlo_profile_index_map) + : hlo_profile_printer_(*hlo_profile_printer), + hlo_profile_index_map_(*hlo_profile_index_map), profile_counters_( /*count*/ hlo_profile_index_map_.total_count(), /*value*/ 0) {} @@ -131,10 +131,4 @@ uint64 HloExecutionProfile::GetCyclesTakenBy(const HloInstruction& hlo) const { return profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(hlo)]; } -string HloExecutionProfile::ToString( - const DeviceDescription& device_description) const { - return hlo_profile_printer_.ToString(profile_counters_.data(), - 
device_description.clock_rate_ghz()); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h index 84702680c0..470fd4ce3c 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.h +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h @@ -77,6 +77,11 @@ class HloProfileIndexMap { std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_; }; +// Create an instance of `HloProfilePrinter` that owns its memory. +std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter( + const HloProfileIndexMap& hlo_profile_index_map, + const HloCostAnalysis& cost_analysis); + // Describes how much time each HLO operation took. // // Each HloComputation takes a certain number of cycles. This class helps break @@ -85,8 +90,8 @@ class HloExecutionProfile { public: using DeviceDescription = perftools::gputools::DeviceDescription; - HloExecutionProfile(const HloModule& module, - const HloCostAnalysis& cost_analysis); + HloExecutionProfile(const HloProfilePrinter* hlo_profile_printer, + const HloProfileIndexMap* hlo_profile_index_map); // Record how many cycles this HLO took to execute. void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken); @@ -114,15 +119,16 @@ class HloExecutionProfile { // for the operations in a given computation. Returns an empty string if it // wasn't possible to generate a printable version. cost_analysis should be a // clean analysis that can be used to visit the computation. - string ToString(const DeviceDescription& device_description) const; + string ToString(const DeviceDescription& device_description) const { + return hlo_profile_printer_.ToString(profile_counters_.data(), + device_description.clock_rate_ghz()); + } - private: - // hlo_profile_index_map_ maps an Hlo entity (computation or instruction) to - // an index in profile_counters_. 
- HloProfileIndexMap hlo_profile_index_map_; + std::vector<int64>* mutable_profile_counters() { return &profile_counters_; } - // Used to print profile_counters_ in a human readable form. - HloProfilePrinter hlo_profile_printer_; + private: + const HloProfilePrinter& hlo_profile_printer_; + const HloProfileIndexMap& hlo_profile_index_map_; // Stores per-Hlo profile counters. This is the only thing that changes when // we execute an XLA computation. diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc index 5ba31296ea..b1e6729e2b 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc @@ -72,7 +72,11 @@ TEST_F(HloExecutionProfileTest, Basic) { }; HloCostAnalysis cost_analysis(shape_size_function); - HloExecutionProfile execution_profile(*hlo_module, cost_analysis); + HloProfileIndexMap profile_index_map(*hlo_module); + std::unique_ptr<HloProfilePrinter> profile_printer = + CreateHloProfilePrinter(profile_index_map, cost_analysis); + HloExecutionProfile execution_profile(profile_printer.get(), + &profile_index_map); const int64 add_cycles = 1000; const int64 dot_cycles = 4000; diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index 96f937caf9..9183a1d1bf 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -42,7 +42,8 @@ namespace sep = ::perftools::gputools::interpreter; InterpreterExecutable::InterpreterExecutable( std::unique_ptr<const HloModule> hlo_module) - : Executable(std::move(hlo_module)) {} + : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr, + /*hlo_profile_index_map=*/nullptr) {} InterpreterExecutable::~InterpreterExecutable() {} @@ -156,10 +157,5 @@ StatusOr<se::DeviceMemoryBase> 
InterpreterExecutable::ExecuteAsyncOnStream( return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); } -std::unique_ptr<HloCostAnalysis> InterpreterExecutable::CreateCostAnalysis() - const { - return MakeUnique<HloCostAnalysis>(ShapeSizeBytes); -} - } // namespace interpreter } // namespace xla diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index c69b0d036d..0e87eb90bf 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -61,8 +61,6 @@ class InterpreterExecutable : public Executable { static int64 ShapeSizeBytes(const Shape& shape); - std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override; - private: TF_DISALLOW_COPY_AND_ASSIGN(InterpreterExecutable); }; diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 902a1afb45..d997cab83f 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -575,12 +575,13 @@ Service::ExecuteParallelAndRegisterResult( // profile. for (auto& index_to_profiled_stream : index_to_profiled_streams) { int64 device = index_to_profiled_stream.first; - auto& module = executables[device]->module(); se::Stream* stream = index_to_profiled_stream.second; - HloExecutionProfile hlo_profile(module, - *executables[device]->CreateCostAnalysis()); - TF_RETURN_IF_ERROR(executables[device]->PopulateExecutionProfile( - &hlo_profile, stream->parent())); + Executable* executable = executables[device]; + const HloModule& module = executable->module(); + HloExecutionProfile hlo_profile(&executable->hlo_profile_printer(), + &executable->hlo_profile_index_map()); + TF_RETURN_IF_ERROR( + executable->PopulateExecutionProfile(&hlo_profile, stream->parent())); XLA_LOG_LINES( tensorflow::INFO, hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription())); |