diff options
8 files changed, 95 insertions, 39 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index cd9ff4d858..53bf055b9d 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -230,7 +230,8 @@ Status CpuExecutable::ExecuteComputeFunction( } if (hlo_execution_profile != nullptr) { - hlo_execution_profile->set_total_cycles_executed(profile_counters.back()); + hlo_execution_profile->set_total_cycles_executed( + *module().entry_computation(), profile_counters.back()); for (auto hlo_prof_idx : hlo_to_profile_idx_) { const HloInstruction* hlo = hlo_prof_idx.first; diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc index d6d3d79f4a..0266442d7a 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc @@ -190,8 +190,8 @@ ParallelCpuExecutable::ExecuteOnStream( std::list<HloInstruction*> pending; // Call the function for each HLO instruction in topological order. - for (auto* instruction : - module().entry_computation()->MakeInstructionPostOrder()) { + const HloComputation& entry_computation = *module().entry_computation(); + for (auto* instruction : entry_computation.MakeInstructionPostOrder()) { // Parameters and constants have no functions associated with them. Instead // just copy the existing buffer into the map containing instruction // results.. 
@@ -299,7 +299,8 @@ ParallelCpuExecutable::ExecuteOnStream( execution_profile_.set_compute_cycle_count(profile_counters.back()); } if (hlo_execution_profile != nullptr) { - hlo_execution_profile->set_total_cycles_executed(profile_counters.back()); + hlo_execution_profile->set_total_cycles_executed(entry_computation, + profile_counters.back()); for (auto hlo_prof_idx : hlo_to_profile_idx_) { const HloInstruction* hlo = hlo_prof_idx.first; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index dc2b048878..ceb897ab9c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -47,8 +47,12 @@ class HloExecutionProfiler { public: // If profiling is enabled, start an execution timer running. explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile, - se::Stream* stream) - : do_profile_(do_profile), profile_(profile), stream_(stream) { + se::Stream* stream, + const HloComputation* computation) + : do_profile_(do_profile), + profile_(profile), + stream_(stream), + computation_(computation) { if (do_profile_) { clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz(); @@ -66,8 +70,8 @@ class HloExecutionProfiler { if (do_profile_) { stream_->ThenStopTimer(execution_timer_.get()); stream_->BlockHostUntilDone(); - profile_->set_total_cycles_executed(execution_timer_->Nanoseconds() * - clock_rate_ghz_); + profile_->set_total_cycles_executed( + *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_); } } @@ -94,6 +98,7 @@ class HloExecutionProfiler { double clock_rate_ghz_; HloExecutionProfile* profile_; se::Stream* stream_; + const HloComputation* computation_; std::unique_ptr<se::Timer> execution_timer_; std::unique_ptr<se::Timer> per_op_timer_; }; @@ -119,7 +124,8 @@ Status GpuExecutable::ExecuteThunks( if (do_profile) { LOG(WARNING) << "PROFILING: profiling is enabled"; } - 
HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream); + HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream, + hlo_module_->entry_computation()); std::vector<std::unique_ptr<se::Stream>> sub_streams; // Stream 0 indicates `main_stream` and substreams start from stream 1. diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 41be997584..8fe1897e75 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -369,7 +369,13 @@ Status HloCostAnalysis::HandleFusion(HloInstruction* fusion) { Status HloCostAnalysis::HandleCall( HloInstruction* call, tensorflow::gtl::ArraySlice<HloInstruction*> operands, HloComputation* computation) { - return Unimplemented("call"); + HloCostAnalysis computation_visitor(shape_size_); + TF_RETURN_IF_ERROR(computation->Accept(&computation_visitor)); + + current_flop_count_ = computation_visitor.flop_count(); + current_transcendental_count_ = computation_visitor.transcendental_count(); + current_bytes_accessed_ = computation_visitor.bytes_accessed(); + return Status::OK(); } Status HloCostAnalysis::HandleCustomCall( diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc index e2a81a052c..447892c8de 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/metric_table_report.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -32,6 +33,7 @@ namespace xla { void HloExecutionProfile::AddProfileResult(const HloInstruction* hlo, uint64 cycles_taken) { hlo_to_cycles_taken_[hlo] = cycles_taken; + profiled_computations_.insert(hlo->parent()); } uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const { @@ -43,17 +45,30 @@ uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const { } string HloExecutionProfile::ToString( + const HloComputation& computation, const DeviceDescription& device_description, - const HloCostAnalysis& cost_analysis) const { + const HloCostAnalysis::ShapeSizeFunction& shape_size) const { + HloCostAnalysis cost_analysis(shape_size); + tensorflow::Status analysis_status = + computation.root_instruction()->Accept(&cost_analysis); + if (!analysis_status.ok()) { + return ""; + } + using Item = std::pair<const HloInstruction*, uint64>; - std::vector<Item> items(hlo_to_cycles_taken_.begin(), - hlo_to_cycles_taken_.end()); + std::vector<Item> items; + for (Item item : hlo_to_cycles_taken_) { + // Only include the HLOs which are part of the desired computation. 
+ if (item.first->parent() == &computation) { + items.push_back(item); + } + } auto custom_less = [](const Item& lhs, const Item& rhs) { return lhs.second > rhs.second; }; std::sort(items.begin(), items.end(), custom_less); string result; - const int64 total_cycles = total_cycles_executed(); + const int64 total_cycles = total_cycles_executed(computation); double clock_rate_ghz = device_description.clock_rate_ghz(); const auto cycles_to_microseconds = [&](double cycles) { @@ -88,11 +103,13 @@ string HloExecutionProfile::ToString( bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str())); }; tensorflow::strings::StrAppend( - &result, - tensorflow::strings::Printf("HLO execution profile: (%s @ f_nom)\n\t", - tensorflow::strings::HumanReadableElapsedTime( - total_cycles / clock_rate_ghz / 1e9) - .c_str())); + &result, tensorflow::strings::Printf( + "HLO execution profile for %s: (%s @ f_nom)\n\t", + computation.name().c_str(), + tensorflow::strings::HumanReadableElapsedTime( + total_cycles / clock_rate_ghz / 1e9) + .c_str())); + append_item(total_cycles, -1, -1, "[total]"); for (const auto& item : items) { const HloInstruction* hlo = item.first; diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h index 6cc2079813..70b94a3f95 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.h +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h @@ -43,27 +43,45 @@ class HloExecutionProfile { uint64 GetProfileResult(const HloInstruction& hlo) const; // Return the number of cycles this computation took to execute. - uint64 total_cycles_executed() const { return total_cycles_executed_; } + uint64 total_cycles_executed(const HloComputation& computation) const { + auto it = total_cycles_executed_.find(&computation); + if (it != total_cycles_executed_.end()) { + return it->second; + } + return 0; + } - // Record how many cycles the entire computation took to execute. 
- void set_total_cycles_executed(uint64 total_cycles_executed) { - total_cycles_executed_ = total_cycles_executed; + // Record how many cycles a computation took to execute. + void set_total_cycles_executed(const HloComputation& computation, + uint64 total_cycles_executed) { + total_cycles_executed_[&computation] = total_cycles_executed; } // Returns a version of the execution profile suitable for performance // debugging; e.g. emits cycle counts, execution time at the nominal device // frequency, and the effective throughput given the provided cost_analysis - // for the operations. - string ToString(const DeviceDescription& device_description, - const HloCostAnalysis& cost_analysis) const; + // for the operations in a given computation. + // Returns an empty string if it wasn't possible to generate a printable + // version. + string ToString(const HloComputation& computation, + const DeviceDescription& device_description, + const HloCostAnalysis::ShapeSizeFunction& shape_size) const; + + // Returns the computations we have profiled. + std::unordered_set<const HloComputation*> profiled_computations() const { + return profiled_computations_; + } private: // Contains a mapping from HLO to the number of cycles it took to execute it. std::unordered_map<const HloInstruction*, uint64> hlo_to_cycles_taken_; - // If non-empty, contains the total number of cycles this computation took to + // If non-empty, contains the total number of cycles a computation took to // execute. - uint64 total_cycles_executed_ = 0; + std::unordered_map<const HloComputation*, uint64> total_cycles_executed_; + + // The computations we have profiled. 
+ std::unordered_set<const HloComputation*> profiled_computations_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index d7d8722ccc..4308d39860 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -329,7 +329,7 @@ string InstructionSequenceGraph( auto hlo_cycles_executed = hlo_execution_profile->GetProfileResult(*instruction); auto total_cycles_executed = - hlo_execution_profile->total_cycles_executed(); + hlo_execution_profile->total_cycles_executed(*instruction->parent()); if (hlo_cycles_executed > 0 && total_cycles_executed > 0) { Appendf(&label, "\\n%% of cycles executed=%.2f", (static_cast<double>(hlo_cycles_executed) / @@ -405,7 +405,7 @@ string ComputationToDotGraph(const HloComputation& computation, const HloExecutionProfile* hlo_execution_profile) { string graph_label = StrCat(label, "\\n", computation.name()); if (hlo_execution_profile != nullptr) { - auto cycles = hlo_execution_profile->total_cycles_executed(); + auto cycles = hlo_execution_profile->total_cycles_executed(computation); Appendf(&graph_label, "\\ntotal cycles = %lld (%s)", cycles, tensorflow::strings::HumanReadableNum(cycles).c_str()); } diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index ce07489fe0..1cad6866ad 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -450,16 +450,23 @@ ReturnT Service::ExecuteOnStreamWrapper( } if (profile_ptr != nullptr) { - HloCostAnalysis analysis([this](const Shape& shape) { + HloCostAnalysis::ShapeSizeFunction shape_size = [this](const Shape& shape) { return execute_backend_->compiler()->ShapeSizeBytes(shape); - }); - tensorflow::Status analysis_status = - executable->module().entry_computation()->root_instruction()->Accept( - &analysis); - if (analysis_status.ok()) { - 
XLA_LOG_LINES(tensorflow::INFO, - profile_ptr->ToString( - stream->parent()->GetDeviceDescription(), analysis)); + }; + std::unordered_set<const xla::HloComputation*> profiled_computations = + profile_ptr->profiled_computations(); + // To ensure we print the profiles in a stable order, iterate over the + // computations in post order. + std::list<xla::HloComputation*> all_computations = + executable->module().MakeComputationPostOrder(); + for (xla::HloComputation* computation : all_computations) { + if (profiled_computations.count(computation) > 0) { + string profile_string = profile_ptr->ToString( + *computation, stream->parent()->GetDeviceDescription(), shape_size); + if (!profile_string.empty()) { + XLA_LOG_LINES(tensorflow::INFO, profile_string); + } + } } DumpExecutedHlo(executable->module(), "Service::Execute", profile_ptr); } |