aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--tensorflow/compiler/xla/service/cpu/cpu_executable.cc3
-rw-r--r--tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc7
-rw-r--r--tensorflow/compiler/xla/service/gpu/gpu_executable.cc16
-rw-r--r--tensorflow/compiler/xla/service/hlo_cost_analysis.cc8
-rw-r--r--tensorflow/compiler/xla/service/hlo_execution_profile.cc35
-rw-r--r--tensorflow/compiler/xla/service/hlo_execution_profile.h36
-rw-r--r--tensorflow/compiler/xla/service/hlo_graph_dumper.cc4
-rw-r--r--tensorflow/compiler/xla/service/service.h25
8 files changed, 95 insertions, 39 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index cd9ff4d858..53bf055b9d 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -230,7 +230,8 @@ Status CpuExecutable::ExecuteComputeFunction(
}
if (hlo_execution_profile != nullptr) {
- hlo_execution_profile->set_total_cycles_executed(profile_counters.back());
+ hlo_execution_profile->set_total_cycles_executed(
+ *module().entry_computation(), profile_counters.back());
for (auto hlo_prof_idx : hlo_to_profile_idx_) {
const HloInstruction* hlo = hlo_prof_idx.first;
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index d6d3d79f4a..0266442d7a 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -190,8 +190,8 @@ ParallelCpuExecutable::ExecuteOnStream(
std::list<HloInstruction*> pending;
// Call the function for each HLO instruction in topological order.
- for (auto* instruction :
- module().entry_computation()->MakeInstructionPostOrder()) {
+ const HloComputation& entry_computation = *module().entry_computation();
+ for (auto* instruction : entry_computation.MakeInstructionPostOrder()) {
// Parameters and constants have no functions associated with them. Instead
// just copy the existing buffer into the map containing instruction
// results..
@@ -299,7 +299,8 @@ ParallelCpuExecutable::ExecuteOnStream(
execution_profile_.set_compute_cycle_count(profile_counters.back());
}
if (hlo_execution_profile != nullptr) {
- hlo_execution_profile->set_total_cycles_executed(profile_counters.back());
+ hlo_execution_profile->set_total_cycles_executed(entry_computation,
+ profile_counters.back());
for (auto hlo_prof_idx : hlo_to_profile_idx_) {
const HloInstruction* hlo = hlo_prof_idx.first;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index dc2b048878..ceb897ab9c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -47,8 +47,12 @@ class HloExecutionProfiler {
public:
// If profiling is enabled, start an execution timer running.
explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
- se::Stream* stream)
- : do_profile_(do_profile), profile_(profile), stream_(stream) {
+ se::Stream* stream,
+ const HloComputation* computation)
+ : do_profile_(do_profile),
+ profile_(profile),
+ stream_(stream),
+ computation_(computation) {
if (do_profile_) {
clock_rate_ghz_ =
stream->parent()->GetDeviceDescription().clock_rate_ghz();
@@ -66,8 +70,8 @@ class HloExecutionProfiler {
if (do_profile_) {
stream_->ThenStopTimer(execution_timer_.get());
stream_->BlockHostUntilDone();
- profile_->set_total_cycles_executed(execution_timer_->Nanoseconds() *
- clock_rate_ghz_);
+ profile_->set_total_cycles_executed(
+ *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_);
}
}
@@ -94,6 +98,7 @@ class HloExecutionProfiler {
double clock_rate_ghz_;
HloExecutionProfile* profile_;
se::Stream* stream_;
+ const HloComputation* computation_;
std::unique_ptr<se::Timer> execution_timer_;
std::unique_ptr<se::Timer> per_op_timer_;
};
@@ -119,7 +124,8 @@ Status GpuExecutable::ExecuteThunks(
if (do_profile) {
LOG(WARNING) << "PROFILING: profiling is enabled";
}
- HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream);
+ HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
+ hlo_module_->entry_computation());
std::vector<std::unique_ptr<se::Stream>> sub_streams;
// Stream 0 indicates `main_stream` and substreams start from stream 1.
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 41be997584..8fe1897e75 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -369,7 +369,13 @@ Status HloCostAnalysis::HandleFusion(HloInstruction* fusion) {
Status HloCostAnalysis::HandleCall(
HloInstruction* call, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
HloComputation* computation) {
- return Unimplemented("call");
+ HloCostAnalysis computation_visitor(shape_size_);
+ TF_RETURN_IF_ERROR(computation->Accept(&computation_visitor));
+
+ current_flop_count_ = computation_visitor.flop_count();
+ current_transcendental_count_ = computation_visitor.transcendental_count();
+ current_bytes_accessed_ = computation_visitor.bytes_accessed();
+ return Status::OK();
}
Status HloCostAnalysis::HandleCustomCall(
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index e2a81a052c..447892c8de 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -21,6 +21,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/metric_table_report.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/strings/numbers.h"
@@ -32,6 +33,7 @@ namespace xla {
void HloExecutionProfile::AddProfileResult(const HloInstruction* hlo,
uint64 cycles_taken) {
hlo_to_cycles_taken_[hlo] = cycles_taken;
+ profiled_computations_.insert(hlo->parent());
}
uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
@@ -43,17 +45,30 @@ uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
}
string HloExecutionProfile::ToString(
+ const HloComputation& computation,
const DeviceDescription& device_description,
- const HloCostAnalysis& cost_analysis) const {
+ const HloCostAnalysis::ShapeSizeFunction& shape_size) const {
+ HloCostAnalysis cost_analysis(shape_size);
+ tensorflow::Status analysis_status =
+ computation.root_instruction()->Accept(&cost_analysis);
+ if (!analysis_status.ok()) {
+ return "";
+ }
+
using Item = std::pair<const HloInstruction*, uint64>;
- std::vector<Item> items(hlo_to_cycles_taken_.begin(),
- hlo_to_cycles_taken_.end());
+ std::vector<Item> items;
+ for (Item item : hlo_to_cycles_taken_) {
+ // Only include the HLOs which are part of the desired computation.
+ if (item.first->parent() == &computation) {
+ items.push_back(item);
+ }
+ }
auto custom_less = [](const Item& lhs, const Item& rhs) {
return lhs.second > rhs.second;
};
std::sort(items.begin(), items.end(), custom_less);
string result;
- const int64 total_cycles = total_cycles_executed();
+ const int64 total_cycles = total_cycles_executed(computation);
double clock_rate_ghz = device_description.clock_rate_ghz();
const auto cycles_to_microseconds = [&](double cycles) {
@@ -88,11 +103,13 @@ string HloExecutionProfile::ToString(
bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str()));
};
tensorflow::strings::StrAppend(
- &result,
- tensorflow::strings::Printf("HLO execution profile: (%s @ f_nom)\n\t",
- tensorflow::strings::HumanReadableElapsedTime(
- total_cycles / clock_rate_ghz / 1e9)
- .c_str()));
+ &result, tensorflow::strings::Printf(
+ "HLO execution profile for %s: (%s @ f_nom)\n\t",
+ computation.name().c_str(),
+ tensorflow::strings::HumanReadableElapsedTime(
+ total_cycles / clock_rate_ghz / 1e9)
+ .c_str()));
+
append_item(total_cycles, -1, -1, "[total]");
for (const auto& item : items) {
const HloInstruction* hlo = item.first;
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 6cc2079813..70b94a3f95 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -43,27 +43,45 @@ class HloExecutionProfile {
uint64 GetProfileResult(const HloInstruction& hlo) const;
// Return the number of cycles this computation took to execute.
- uint64 total_cycles_executed() const { return total_cycles_executed_; }
+ uint64 total_cycles_executed(const HloComputation& computation) const {
+ auto it = total_cycles_executed_.find(&computation);
+ if (it != total_cycles_executed_.end()) {
+ return it->second;
+ }
+ return 0;
+ }
- // Record how many cycles the entire computation took to execute.
- void set_total_cycles_executed(uint64 total_cycles_executed) {
- total_cycles_executed_ = total_cycles_executed;
+ // Record how many cycles a computation took to execute.
+ void set_total_cycles_executed(const HloComputation& computation,
+ uint64 total_cycles_executed) {
+ total_cycles_executed_[&computation] = total_cycles_executed;
}
// Returns a version of the execution profile suitable for performance
// debugging; e.g. emits cycle counts, execution time at the nominal device
// frequency, and the effective throughput given the provided cost_analysis
- // for the operations.
- string ToString(const DeviceDescription& device_description,
- const HloCostAnalysis& cost_analysis) const;
+ // for the operations in a given computation.
+ // Returns an empty string if it wasn't possible to generate a printable
+ // version.
+ string ToString(const HloComputation& computation,
+ const DeviceDescription& device_description,
+ const HloCostAnalysis::ShapeSizeFunction& shape_size) const;
+
+ // Returns the computations we have profiled.
+ std::unordered_set<const HloComputation*> profiled_computations() const {
+ return profiled_computations_;
+ }
private:
// Contains a mapping from HLO to the number of cycles it took to execute it.
std::unordered_map<const HloInstruction*, uint64> hlo_to_cycles_taken_;
- // If non-empty, contains the total number of cycles this computation took to
+ // If non-empty, contains the total number of cycles a computation took to
// execute.
- uint64 total_cycles_executed_ = 0;
+ std::unordered_map<const HloComputation*, uint64> total_cycles_executed_;
+
+ // The computations we have profiled.
+ std::unordered_set<const HloComputation*> profiled_computations_;
};
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index d7d8722ccc..4308d39860 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -329,7 +329,7 @@ string InstructionSequenceGraph(
auto hlo_cycles_executed =
hlo_execution_profile->GetProfileResult(*instruction);
auto total_cycles_executed =
- hlo_execution_profile->total_cycles_executed();
+ hlo_execution_profile->total_cycles_executed(*instruction->parent());
if (hlo_cycles_executed > 0 && total_cycles_executed > 0) {
Appendf(&label, "\\n%% of cycles executed=%.2f",
(static_cast<double>(hlo_cycles_executed) /
@@ -405,7 +405,7 @@ string ComputationToDotGraph(const HloComputation& computation,
const HloExecutionProfile* hlo_execution_profile) {
string graph_label = StrCat(label, "\\n", computation.name());
if (hlo_execution_profile != nullptr) {
- auto cycles = hlo_execution_profile->total_cycles_executed();
+ auto cycles = hlo_execution_profile->total_cycles_executed(computation);
Appendf(&graph_label, "\\ntotal cycles = %lld (%s)", cycles,
tensorflow::strings::HumanReadableNum(cycles).c_str());
}
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index ce07489fe0..1cad6866ad 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -450,16 +450,23 @@ ReturnT Service::ExecuteOnStreamWrapper(
}
if (profile_ptr != nullptr) {
- HloCostAnalysis analysis([this](const Shape& shape) {
+ HloCostAnalysis::ShapeSizeFunction shape_size = [this](const Shape& shape) {
return execute_backend_->compiler()->ShapeSizeBytes(shape);
- });
- tensorflow::Status analysis_status =
- executable->module().entry_computation()->root_instruction()->Accept(
- &analysis);
- if (analysis_status.ok()) {
- XLA_LOG_LINES(tensorflow::INFO,
- profile_ptr->ToString(
- stream->parent()->GetDeviceDescription(), analysis));
+ };
+ std::unordered_set<const xla::HloComputation*> profiled_computations =
+ profile_ptr->profiled_computations();
+ // To ensure we print the profiles in a stable order, iterate over the
+ // computations in post order.
+ std::list<xla::HloComputation*> all_computations =
+ executable->module().MakeComputationPostOrder();
+ for (xla::HloComputation* computation : all_computations) {
+ if (profiled_computations.count(computation) > 0) {
+ string profile_string = profile_ptr->ToString(
+ *computation, stream->parent()->GetDeviceDescription(), shape_size);
+ if (!profile_string.empty()) {
+ XLA_LOG_LINES(tensorflow::INFO, profile_string);
+ }
+ }
}
DumpExecutedHlo(executable->module(), "Service::Execute", profile_ptr);
}