-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_compiler.cc            | 75
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_executable.cc          | 51
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_executable.h           | 19
-rw-r--r--  tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc | 46
-rw-r--r--  tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h  | 10
-rw-r--r--  tensorflow/compiler/xla/service/executable.h                   | 35
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_compiler.cc            | 18
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.cc          | 13
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.h           |  8
-rw-r--r--  tensorflow/compiler/xla/service/hlo_execution_profile.cc       | 22
-rw-r--r--  tensorflow/compiler/xla/service/hlo_execution_profile.h        | 24
-rw-r--r--  tensorflow/compiler/xla/service/hlo_execution_profile_test.cc  |  6
-rw-r--r--  tensorflow/compiler/xla/service/interpreter/executable.cc      |  8
-rw-r--r--  tensorflow/compiler/xla/service/interpreter/executable.h       |  2
-rw-r--r--  tensorflow/compiler/xla/service/service.cc                     | 11
15 files changed, 193 insertions, 155 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 56940b8d63..ff6042ae19 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -197,28 +197,35 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
class CollectProfileCandidates : public DfsHloVisitorWithDefault {
public:
static StatusOr<std::unordered_map<const HloInstruction*, size_t>>
- GetCandidatesForComputation(HloComputation* computation) {
+ GetCandidatesForComputation(
+ HloComputation* computation,
+ const std::unordered_map<const HloInstruction*, int64>&
+ assigned_indices) {
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
CollectProfileCandidates profile_candidates_for_computation(
- &hlo_to_profile_idx);
+ &hlo_to_profile_idx, assigned_indices);
TF_RETURN_IF_ERROR(
computation->Accept(&profile_candidates_for_computation));
return hlo_to_profile_idx;
}
private:
- explicit CollectProfileCandidates(
- std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx)
- : hlo_to_profile_idx_(hlo_to_profile_idx) {}
+ CollectProfileCandidates(
+ std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx,
+ const std::unordered_map<const HloInstruction*, int64>& assigned_indices)
+ : hlo_to_profile_idx_(hlo_to_profile_idx),
+ assigned_indices_(assigned_indices) {}
Status DefaultAction(HloInstruction* hlo_instruction) override {
- hlo_to_profile_idx_->insert({hlo_instruction, hlo_to_profile_idx_->size()});
+ hlo_to_profile_idx_->insert(
+ {hlo_instruction, FindOrDie(assigned_indices_, hlo_instruction)});
return Status::OK();
}
Status HandleCall(HloInstruction* call) override {
TF_RETURN_IF_ERROR(DefaultAction(call));
- CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_);
+ CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_,
+ assigned_indices_);
TF_RETURN_IF_ERROR(call->to_apply()->Accept(&candidates_for_call));
return Status::OK();
}
@@ -232,17 +239,20 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
Status HandleWhile(HloInstruction* xla_while) override {
TF_RETURN_IF_ERROR(DefaultAction(xla_while));
- CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_);
+ CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_,
+ assigned_indices_);
TF_RETURN_IF_ERROR(
xla_while->while_condition()->Accept(&candidates_for_condition));
- CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_);
+ CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_,
+ assigned_indices_);
TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&candidates_for_body));
return Status::OK();
}
std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx_;
+ const std::unordered_map<const HloInstruction*, int64>& assigned_indices_;
};
} // namespace
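
Why the visitor now takes assigned_indices: previously each CollectProfileCandidates minted indices from hlo_to_profile_idx_->size(), so separately visited subcomputations (while bodies, called computations) could disagree with any externally assigned numbering. Below is a standalone C++ model of the new scheme, using plain STL maps and invented instruction names rather than the XLA types:

    #include <cassert>
    #include <string>
    #include <unordered_map>

    int main() {
      // Plays the role of HloProfileIndexMap::instruction_to_profile_idx():
      // every instruction receives its index exactly once, up front.
      const std::unordered_map<std::string, long long> assigned = {
          {"while", 0}, {"cond.lt", 1}, {"body.add", 2}};

      // Plays the role of hlo_to_profile_idx: each visitor copies indices
      // out of `assigned` (the FindOrDie call) instead of minting them from
      // its own size, so nested visitors agree on the numbering.
      std::unordered_map<std::string, long long> collected;
      for (const char* hlo : {"body.add", "while", "cond.lt"}) {
        collected.insert({hlo, assigned.at(hlo)});
      }
      assert(collected.at("body.add") == 2);  // independent of visit order
      return 0;
    }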
@@ -475,10 +485,27 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
HloComputation* computation = module->entry_computation();
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
if (module->config().hlo_profiling_enabled()) {
+ hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
+
TF_ASSIGN_OR_RETURN(
hlo_to_profile_idx,
- CollectProfileCandidates::GetCandidatesForComputation(computation));
+ CollectProfileCandidates::GetCandidatesForComputation(
+ computation, hlo_profile_index_map->instruction_to_profile_idx()));
+
+ auto shape_size_bytes = [](const Shape& shape) {
+ // On the CPU, opaques are pointers.
+ if (ShapeUtil::IsOpaque(shape)) {
+ return static_cast<int64>(sizeof(void*));
+ }
+ return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+ };
+
+ HloCostAnalysis cost_analysis(shape_size_bytes);
+ hlo_profile_printer =
+ CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
}
std::unique_ptr<Executable> cpu_executable;
@@ -544,8 +571,16 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
parallel_computations.emplace(to_apply, instruction);
}
+ // We always profile the entire computation as a whole, even if hlo
+ // profiling is disabled. In that case we pass in a profile counter array
+ // of just one element, which corresponds to the whole computation.
+ size_t entry_computation_profile_idx =
+ hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
+ *module->entry_computation())
+ : 0;
IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
- hlo_to_profile_idx, hlo_to_profile_idx.size(),
+ hlo_to_profile_idx, entry_computation_profile_idx,
jit->target_machine(), jit->external_constant_pool());
std::unique_ptr<HloInstructionMap<string>> function_names(
@@ -586,8 +621,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
jit->AddModule(std::move(llvm_module));
cpu_executable.reset(new ParallelCpuExecutable(
std::move(jit), std::move(assignment), std::move(module),
- std::move(function_names), std::move(hlo_to_profile_idx),
- std::move(aligned_constants)));
+ std::move(function_names), std::move(aligned_constants),
+ std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
if (embed_ir_in_executable) {
static_cast<CpuExecutable&>(*cpu_executable)
@@ -620,12 +655,22 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
proto, xla_dump_hlo_proto_to, module->name()));
}
+ // We always profile the entire computation as a whole, even if hlo
+ // profiling is disabled. In that case we pass in a profile counter array
+ // of just one element, which corresponds to the whole computation.
+ size_t entry_computation_profile_idx =
+ hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
+ *module->entry_computation())
+ : 0;
+
// Each computation is a single function. Emit all embedded computations
// before the entry computation. The order of computations returned from
// GetEmbeddedComputations guarantees that a called computation occurs
// before a caller computation.
+
IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
- hlo_to_profile_idx, hlo_to_profile_idx.size(),
+ hlo_to_profile_idx, entry_computation_profile_idx,
jit->target_machine(), jit->external_constant_pool());
for (auto embedded_computation :
@@ -659,7 +704,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
jit->AddModule(std::move(llvm_module));
cpu_executable.reset(new CpuExecutable(
std::move(jit), std::move(assignment), std::move(module), function_name,
- std::move(hlo_to_profile_idx)));
+ std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
if (embed_ir_in_executable) {
static_cast<CpuExecutable&>(*cpu_executable)
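
The entry_computation_profile_idx ternary above implements the single-counter fallback described in the comment: with profiling enabled the index comes from HloProfileIndexMap, otherwise the lone counter lives at index 0. A minimal standalone sketch of that selection; FakeIndexMap and its index 7 are invented stand-ins, not the XLA API:

    #include <cstddef>
    #include <vector>

    struct FakeIndexMap {         // stands in for HloProfileIndexMap
      std::size_t entry_idx = 7;  // hypothetical index of the entry computation
    };

    std::size_t EntryProfileIdx(const FakeIndexMap* map) {
      return map != nullptr ? map->entry_idx : 0;
    }

    int main() {
      FakeIndexMap map;
      std::vector<long long> with_profiling(16, 0);
      std::vector<long long> without_profiling(1, 0);
      with_profiling[EntryProfileIdx(&map)] += 123;        // profiling enabled
      without_profiling[EntryProfileIdx(nullptr)] += 123;  // single-counter mode
      return 0;
    }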
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index e6ef9d6314..e956f478b8 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -55,11 +55,12 @@ CpuExecutable::CpuExecutable(
std::unique_ptr<const BufferAssignment> assignment,
std::unique_ptr<const HloModule> hlo_module,
const string& entry_function_name,
- std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx)
- : Executable(std::move(hlo_module)),
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+ : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+ std::move(hlo_profile_index_map)),
jit_(std::move(jit)),
- assignment_(std::move(assignment)),
- hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) {
+ assignment_(std::move(assignment)) {
// Resolve symbols in the constructor rather than at execution time to avoid
// races because FindSymbol is not thread safe.
llvm::JITSymbol sym = jit_->FindSymbol(entry_function_name);
@@ -183,9 +184,16 @@ Status CpuExecutable::ExecuteComputeFunction(
uint64 start_micros = tensorflow::Env::Default()->NowMicros();
// Allocate profiling counters for each hlo instruction that we would like to
- // profile. Allocate an additional profile counter for the entire
- // computation.
- std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1);
+ // profile. Even when HLO profiling is disabled, we allocate a counter for
+ // the entire computation, which we use to update ExecutionProfile below.
+ std::vector<int64>* profile_counters = nullptr;
+ std::vector<int64> profile_counter_for_entry_computation;
+ if (hlo_execution_profile) {
+ profile_counters = hlo_execution_profile->mutable_profile_counters();
+ } else {
+ profile_counters = &profile_counter_for_entry_computation;
+ profile_counter_for_entry_computation.push_back(0);
+ }
// Call the computation function following the calling convention.
std::vector<void*> buffer_pointers;
@@ -200,7 +208,7 @@ Status CpuExecutable::ExecuteComputeFunction(
VLOG(3) << tensorflow::strings::Printf(
" func(void* result, void* params[%zu], void* temps[%zu], "
"uint64 profile_counters[%zu])",
- args_array.size(), buffer_pointers.size(), profile_counters.size());
+ args_array.size(), buffer_pointers.size(), profile_counters->size());
VLOG(3) << tensorflow::strings::Printf(" result = %p", result_buffer);
auto ptr_printer = [](string* out, const void* p) {
tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
@@ -212,11 +220,11 @@ Status CpuExecutable::ExecuteComputeFunction(
" temps = [%s]",
tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
VLOG(3) << tensorflow::strings::Printf(" profile_counters = %p",
- profile_counters.data());
+ profile_counters->data());
}
compute_function_(result_buffer, run_options, args_array.data(),
- buffer_pointers.data(), profile_counters.data());
+ buffer_pointers.data(), profile_counters->data());
uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -225,20 +233,15 @@ Status CpuExecutable::ExecuteComputeFunction(
const double nanoseconds = (end_micros - start_micros) * 1000.0;
execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
- // The last profile counter is used for the computation as a whole.
- execution_profile_.set_compute_cycle_count(profile_counters.back());
- }
-
- if (hlo_execution_profile != nullptr) {
- hlo_execution_profile->set_total_cycles_executed(
- *module().entry_computation(), profile_counters.back());
-
- for (auto hlo_prof_idx : hlo_to_profile_idx_) {
- const HloInstruction* hlo = hlo_prof_idx.first;
- uint64 cycles_taken = profile_counters[hlo_prof_idx.second];
- hlo_execution_profile->SetCyclesTakenBy(hlo, cycles_taken);
+ if (hlo_execution_profile) {
+ execution_profile_.set_compute_cycle_count(
+ hlo_execution_profile->total_cycles_executed(
+ *module().entry_computation()));
+ } else {
+ execution_profile_.set_compute_cycle_count(profile_counters->back());
}
}
+
return Status::OK();
}
@@ -428,9 +431,5 @@ const PointsToSet& CpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction());
}
-std::unique_ptr<HloCostAnalysis> CpuExecutable::CreateCostAnalysis() const {
- return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
} // namespace cpu
} // namespace xla
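
The rewritten ExecuteComputeFunction always hands the JITed function a non-null counter array; only the owner differs (the HloExecutionProfile's vector when profiling, a local one-element vector otherwise). A standalone sketch of that pattern, with FakeProfile standing in for HloExecutionProfile:

    #include <vector>

    struct FakeProfile {  // stands in for HloExecutionProfile
      std::vector<long long> counters = std::vector<long long>(16, 0);
      std::vector<long long>* mutable_profile_counters() { return &counters; }
    };

    void RunComputation(FakeProfile* profile) {
      std::vector<long long> local_counter;
      std::vector<long long>* counters = nullptr;
      if (profile != nullptr) {
        counters = profile->mutable_profile_counters();
      } else {
        local_counter.push_back(0);  // one slot for the whole computation
        counters = &local_counter;
      }
      // compute_function_(..., counters->data()) would be called here; the
      // fallback vector is a named local so it outlives the call.
      counters->back() += 1;
    }

    int main() {
      FakeProfile profile;
      RunComputation(&profile);  // HLO profiling enabled
      RunComputation(nullptr);   // single-counter mode
      return 0;
    }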
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 238bc9b46a..17ee2d673e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -47,12 +47,12 @@ namespace cpu {
// architecture, so JIT-ed code and host code share the same ABI.
class CpuExecutable : public Executable {
public:
- CpuExecutable(
- std::unique_ptr<SimpleOrcJIT> jit,
- std::unique_ptr<const BufferAssignment> assignment,
- std::unique_ptr<const HloModule> hlo_module,
- const string& entry_function_name,
- std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx);
+ CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit,
+ std::unique_ptr<const BufferAssignment> assignment,
+ std::unique_ptr<const HloModule> hlo_module,
+ const string& entry_function_name,
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
~CpuExecutable() override {}
StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -85,12 +85,10 @@ class CpuExecutable : public Executable {
static int64 ShapeSizeBytes(const Shape& shape);
- std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
// Type of the computation function we expect in the JIT.
using ComputeFunctionType = void (*)(
void* /*result*/, const ExecutableRunOptions* /*run_options*/,
- const void** /*args*/, void** /*temps*/, uint64* /*profile_counters*/);
+ const void** /*args*/, void** /*temps*/, int64* /*profile_counters*/);
const ComputeFunctionType& compute_function() const {
return compute_function_;
@@ -145,9 +143,6 @@ class CpuExecutable : public Executable {
// Entry function name for the computation.
const string entry_function_name_;
- // Maps HLOs to their index into the profile counter array.
- const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
-
TF_DISALLOW_COPY_AND_ASSIGN(CpuExecutable);
};
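
The header change also retypes the JIT profile-counter argument from uint64* to int64*, so JITed code can write directly into the HloExecutionProfile's int64 counter vector. A simplified standalone sketch of the calling convention; note the real ComputeFunctionType also takes const ExecutableRunOptions*:

    #include <cstdint>

    using ComputeFunctionType = void (*)(void* /*result*/,
                                         const void** /*args*/,
                                         void** /*temps*/,
                                         std::int64_t* /*profile_counters*/);

    // Stand-in for JITed code: bumps the counter of one instruction.
    void FakeJittedFn(void*, const void**, void**, std::int64_t* counters) {
      counters[0] += 42;  // cycles attributed to instruction 0
    }

    int main() {
      ComputeFunctionType fn = &FakeJittedFn;
      std::int64_t counters[1] = {0};
      fn(nullptr, nullptr, nullptr, counters);
      return counters[0] == 42 ? 0 : 1;
    }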
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index aff61296ce..0077e344e2 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -59,19 +59,20 @@ ParallelCpuExecutable::ParallelCpuExecutable(
std::unique_ptr<const BufferAssignment> assignment,
std::unique_ptr<const HloModule> hlo_module,
std::unique_ptr<const HloInstructionMap<string>> function_names,
- std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
- aligned_constants)
- : Executable(std::move(hlo_module)),
+ aligned_constants,
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+ : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+ std::move(hlo_profile_index_map)),
jit_(std::move(jit)),
assignment_(std::move(assignment)),
function_names_(std::move(function_names)),
- hlo_to_profile_idx_(std::move(hlo_to_profile_idx)),
aligned_constants_(std::move(aligned_constants)) {}
// Type of the computation function we expect in the JIT.
using ComputeFunctionType = void (*)(void*, const void*, const void**, void**,
- int64*, uint64*);
+ int64*, int64*);
// Given a pointer to an output buffer (following the CPU JIT calling
// conventions), mark addresses that are "live". The initial pointer itself is
@@ -106,7 +107,7 @@ class Executor {
const ServiceExecutableRunOptions* run_options,
std::list<HloInstruction*>* pending,
HloInstructionMap<const void*>* results, void** temps_array,
- uint64* profile_counters_array, const BufferAssignment* assignment)
+ int64* profile_counters_array, const BufferAssignment* assignment)
: functions_(functions),
run_options_(run_options),
pending_(pending),
@@ -147,7 +148,7 @@ class Executor {
std::list<HloInstruction*>* pending_;
HloInstructionMap<const void*>* results_;
void** temps_array_;
- uint64* profile_counters_array_;
+ int64* profile_counters_array_;
tensorflow::thread::ThreadPool* thread_pool_;
const BufferAssignment* assignment_;
@@ -389,9 +390,11 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
HloExecutionProfile* hlo_execution_profile) {
// Allocate profiling counters for each hlo instruction that we would like to
- // profile. Allocate an additional profile counter for the entire
- // computation.
- std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1);
+ // profile.
+ std::vector<int64>* profile_counters = nullptr;
+ if (hlo_execution_profile) {
+ profile_counters = hlo_execution_profile->mutable_profile_counters();
+ }
std::vector<void*> buffer_pointers;
buffer_pointers.reserve(buffers.size());
@@ -441,9 +444,9 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
// For example, if we expect a library conv/matmul call to run at max
// concurrency, we should not dispatch runnable instructions until the
// library call is finished (to avoid expensive cache invalidation).
- Executor executor(functions, run_options, &pending, &results,
- buffer_pointers.data(), profile_counters.data(),
- assignment_.get());
+ Executor executor(
+ functions, run_options, &pending, &results, buffer_pointers.data(),
+ profile_counters ? profile_counters->data() : nullptr, assignment_.get());
TF_RETURN_IF_ERROR(executor.Run());
@@ -453,18 +456,6 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
tensorflow::mutex_lock lock(mutex_);
double nanoseconds = (end_micros - start_micros) * 1000.0;
execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
- // The last profile counter is used for the computation as a whole.
- execution_profile_.set_compute_cycle_count(profile_counters.back());
- }
- if (hlo_execution_profile != nullptr) {
- hlo_execution_profile->set_total_cycles_executed(entry_computation,
- profile_counters.back());
-
- for (auto hlo_prof_idx : hlo_to_profile_idx_) {
- const HloInstruction* hlo = hlo_prof_idx.first;
- uint64 cycles_taken = profile_counters[hlo_prof_idx.second];
- hlo_execution_profile->SetCyclesTakenBy(hlo, cycles_taken);
- }
}
return Status::OK();
@@ -618,10 +609,5 @@ const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction());
}
-std::unique_ptr<HloCostAnalysis> ParallelCpuExecutable::CreateCostAnalysis()
- const {
- return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
} // namespace cpu
} // namespace xla
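
Unlike the sequential CPU path, the parallel path leaves profile_counters null when profiling is off and forwards the raw pointer (or nullptr) into the Executor, so any recording must be guarded. A standalone sketch of that convention; RecordCycles is an invented helper, not an XLA function:

    #include <cstdint>

    void RecordCycles(std::int64_t* profile_counters, int idx,
                      std::int64_t cycles) {
      if (profile_counters != nullptr) {  // profiling disabled => no-op
        profile_counters[idx] += cycles;
      }
    }

    int main() {
      std::int64_t counters[4] = {0, 0, 0, 0};
      RecordCycles(counters, 2, 100);  // profiling enabled
      RecordCycles(nullptr, 2, 100);   // profiling disabled: safely ignored
      return counters[2] == 100 ? 0 : 1;
    }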
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index db16aaf48b..d65e3f42f3 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -52,10 +52,11 @@ class ParallelCpuExecutable : public Executable {
std::unique_ptr<const BufferAssignment> assignment,
std::unique_ptr<const HloModule> hlo_module,
std::unique_ptr<const HloInstructionMap<string>> function_names,
- std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
std::unordered_map<const HloInstruction*,
std::unique_ptr<unsigned char[]>>
- aligned_constants);
+ aligned_constants,
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
~ParallelCpuExecutable() override {}
StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -95,8 +96,6 @@ class ParallelCpuExecutable : public Executable {
"Equality test on CPU parallel executable is not implemented.");
}
- std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
private:
// Allocate buffers required for execution and assign them to the elements of
// "buffers". "buffers" should be sized to the number of buffers in buffer
@@ -143,9 +142,6 @@ class ParallelCpuExecutable : public Executable {
// Map containing the JITted function names for each HLO instruction.
const std::unique_ptr<const HloInstructionMap<string>> function_names_;
- // Maps HLOs to their index into the profile counter array.
- const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
-
// Map from HLO Constant instructions to a pointer to their literal data.
// The data stored in the protocol buffer might be insufficiently aligned,
// we create a sufficiently aligned copy and store it in this map.
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 2135707371..08862308c9 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -44,8 +44,15 @@ namespace xla {
// interface that is used for launching compiled programs across platforms.
class Executable {
public:
- explicit Executable(std::unique_ptr<const HloModule> hlo_module)
- : hlo_module_(std::move(hlo_module)) {}
+ explicit Executable(std::unique_ptr<const HloModule> hlo_module,
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+ : hlo_module_(std::move(hlo_module)),
+ hlo_profile_printer_(std::move(hlo_profile_printer)),
+ hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
+ CHECK_EQ(hlo_profile_printer_.get() == nullptr,
+ hlo_profile_index_map_.get() == nullptr);
+ }
virtual ~Executable() {}
// Enqueues the compilation result on the provided stream, passing the given
@@ -123,12 +130,20 @@ class Executable {
"Equality test on this executable is not implemented.");
}
+ const HloProfilePrinter& hlo_profile_printer() const {
+ CHECK(hlo_profiling_enabled());
+ return *hlo_profile_printer_;
+ }
+
+ const HloProfileIndexMap& hlo_profile_index_map() const {
+ CHECK(hlo_profiling_enabled());
+ return *hlo_profile_index_map_;
+ }
+
// Returns whether this executable was compiled with HLO profiling support
// enabled. If not, the caller should not expect an hlo_execution_profile
// passed to ExecuteOnStream above to be populated during execution.
- bool hlo_profiling_enabled() const {
- return hlo_module_->config().hlo_profiling_enabled();
- }
+ bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
const HloModule& module() const { return *hlo_module_; }
@@ -160,10 +175,6 @@ class Executable {
static Status DumpToDirectory(const string& directory_path, string filename,
const SessionModule& session_module);
- // Returns a cost analysis object appropriate for the platform on which this
- // executable can run.
- virtual std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const = 0;
-
protected:
mutable tensorflow::mutex mutex_;
@@ -181,6 +192,9 @@ class Executable {
// Execution count, used to generate a unique filename for each dumped
// execution.
int64 execution_count_ = 0;
+
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer_;
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
};
template <typename ReturnT, typename ArgT>
@@ -200,7 +214,8 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
std::unique_ptr<HloExecutionProfile> profile_ptr =
module_config().debug_options().xla_hlo_profile() &&
hlo_profiling_enabled()
- ? MakeUnique<HloExecutionProfile>(module(), *CreateCostAnalysis())
+ ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer(),
+ &hlo_profile_index_map())
: nullptr;
auto return_value =
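
The new Executable constructor enforces a both-or-neither invariant with CHECK_EQ: a profile printer without an index map (or vice versa) would be unusable, and hlo_profiling_enabled() reduces to "was a printer compiled in". A standalone model of the invariant and the guarded state (Printer/IndexMap are invented stand-ins):

    #include <cassert>
    #include <memory>

    struct Printer {};   // stands in for HloProfilePrinter
    struct IndexMap {};  // stands in for HloProfileIndexMap

    class FakeExecutable {
     public:
      FakeExecutable(std::unique_ptr<Printer> printer,
                     std::unique_ptr<IndexMap> index_map)
          : printer_(std::move(printer)), index_map_(std::move(index_map)) {
        // CHECK_EQ analogue: both null or both non-null.
        assert((printer_ == nullptr) == (index_map_ == nullptr));
      }
      bool hlo_profiling_enabled() const { return printer_ != nullptr; }

     private:
      std::unique_ptr<Printer> printer_;
      std::unique_ptr<IndexMap> index_map_;
    };

    int main() {
      FakeExecutable off(nullptr, nullptr);
      FakeExecutable on(std::make_unique<Printer>(),
                        std::make_unique<IndexMap>());
      return (!off.hlo_profiling_enabled() && on.hlo_profiling_enabled()) ? 0
                                                                          : 1;
    }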
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 92c53265d0..fcd73fd37a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -465,10 +465,20 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
VLOG(2) << "Printing the thunk schedule...";
XLA_VLOG_LINES(2, thunk_schedule->ToString());
- auto* gpu_executable =
- new GpuExecutable(ptx, cubin, {cc_major, cc_minor},
- std::move(thunk_schedule), std::move(module),
- std::move(buffer_assignment), ShapeSizeBytesFunction());
+ std::unique_ptr<HloProfileIndexMap> profile_index_map;
+ std::unique_ptr<HloProfilePrinter> profile_printer;
+
+ if (module->config().hlo_profiling_enabled()) {
+ HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
+ profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
+ profile_printer =
+ CreateHloProfilePrinter(*profile_index_map, cost_analysis);
+ }
+
+ auto* gpu_executable = new GpuExecutable(
+ ptx, cubin, {cc_major, cc_minor}, std::move(thunk_schedule),
+ std::move(module), std::move(buffer_assignment),
+ std::move(profile_printer), std::move(profile_index_map));
if (embed_ir_in_executable) {
DCHECK_NE("", ir_module_string_before_opt);
gpu_executable->set_ir_module_string(ir_module_string_before_opt);
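
Both backends now build the printer from an HloCostAnalysis parameterized by a backend-specific shape-size callback (ShapeSizeBytesFunction here, the shape_size_bytes lambda on the CPU side). A standalone sketch of such a callback; the Shape struct and the f32-payload assumption are invented for illustration:

    #include <cstdint>
    #include <functional>

    struct Shape {  // invented; real code uses xla::Shape
      std::int64_t elements;
      bool opaque;
    };
    using ShapeSizeFn = std::function<std::int64_t(const Shape&)>;

    // Mirrors the CPU lambda earlier in this diff: opaques are pointer-sized
    // on the host; other shapes are payload-sized (f32 assumed here).
    std::int64_t ShapeSizeBytes(const Shape& s) {
      if (s.opaque) return static_cast<std::int64_t>(sizeof(void*));
      return s.elements * 4;
    }

    int main() {
      ShapeSizeFn fn = ShapeSizeBytes;
      Shape tensor{1024, false};
      Shape token{0, true};
      return (fn(tensor) == 4096 &&
              fn(token) == static_cast<std::int64_t>(sizeof(void*)))
                 ? 0
                 : 1;
    }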
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index c6f23f9b05..0fd85e4fb0 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -113,14 +113,15 @@ GpuExecutable::GpuExecutable(
std::unique_ptr<const ThunkSchedule> thunk_schedule,
std::unique_ptr<const HloModule> hlo_module,
std::unique_ptr<const BufferAssignment> assignment,
- HloCostAnalysis::ShapeSizeFunction shape_size_function)
- : Executable(std::move(hlo_module)),
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+ : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+ std::move(hlo_profile_index_map)),
ptx_(ptx),
cubin_(cubin),
compute_capability_(compute_capability),
thunk_schedule_(std::move(thunk_schedule)),
- assignment_(std::move(assignment)),
- shape_size_function_(std::move(shape_size_function)) {}
+ assignment_(std::move(assignment)) {}
Status GpuExecutable::ExecuteThunks(
const ServiceExecutableRunOptions* run_options,
@@ -358,9 +359,5 @@ const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction());
}
-std::unique_ptr<HloCostAnalysis> GpuExecutable::CreateCostAnalysis() const {
- return MakeUnique<HloCostAnalysis>(shape_size_function_);
-}
-
} // namespace gpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index a3815370c1..e7307e07c0 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -54,7 +54,8 @@ class GpuExecutable : public Executable {
std::unique_ptr<const ThunkSchedule> thunk_schedule,
std::unique_ptr<const HloModule> hlo_module,
std::unique_ptr<const BufferAssignment> assignment,
- HloCostAnalysis::ShapeSizeFunction shape_size_function);
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
// This should be called after set_ir_module_string.
const string& ir_module_string() const { return ir_module_string_; }
@@ -95,8 +96,6 @@ class GpuExecutable : public Executable {
return Unimplemented("Equality test on GPU executable is not implemented.");
}
- std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
private:
// If `block_host_until_done` is false, execution will not block the host
// until the kernels have completed. This is used as an optimization for
@@ -140,9 +139,6 @@ class GpuExecutable : public Executable {
// memory for every output/temp buffers.
const std::unique_ptr<const BufferAssignment> assignment_;
- // Function to compute the size of a given Shape, in bytes.
- const HloCostAnalysis::ShapeSizeFunction shape_size_function_;
-
TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index 9e256b9b37..ba75e2ef1b 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -40,7 +40,7 @@ HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
}
}
-static HloProfilePrinter CreateOwnedHloProfilePrinter(
+std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
const HloProfileIndexMap& hlo_profile_index_map,
const HloCostAnalysis& cost_analysis) {
using HloComputationInfo = HloProfilePrinter::HloComputationInfo;
@@ -108,15 +108,15 @@ static HloProfilePrinter CreateOwnedHloProfilePrinter(
delete[] computation_infos;
};
- return HloProfilePrinter(computation_infos,
- hlo_profile_index_map.computation_count(), deleter);
+ return MakeUnique<HloProfilePrinter>(
+ computation_infos, hlo_profile_index_map.computation_count(), deleter);
}
-HloExecutionProfile::HloExecutionProfile(const HloModule& module,
- const HloCostAnalysis& cost_analysis)
- : hlo_profile_index_map_(module),
- hlo_profile_printer_(
- CreateOwnedHloProfilePrinter(hlo_profile_index_map_, cost_analysis)),
+HloExecutionProfile::HloExecutionProfile(
+ const HloProfilePrinter* hlo_profile_printer,
+ const HloProfileIndexMap* hlo_profile_index_map)
+ : hlo_profile_printer_(*hlo_profile_printer),
+ hlo_profile_index_map_(*hlo_profile_index_map),
profile_counters_(
/*count*/ hlo_profile_index_map_.total_count(),
/*value*/ 0) {}
@@ -131,10 +131,4 @@ uint64 HloExecutionProfile::GetCyclesTakenBy(const HloInstruction& hlo) const {
return profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(hlo)];
}
-string HloExecutionProfile::ToString(
- const DeviceDescription& device_description) const {
- return hlo_profile_printer_.ToString(profile_counters_.data(),
- device_description.clock_rate_ghz());
-}
-
} // namespace xla
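
CreateHloProfilePrinter keeps the printer's flat C-style metadata arrays alive by handing the printer a deleter, and now wraps the result in a unique_ptr so the Executable can own it. A standalone model of that ownership pattern; FlatPrinter and Info are invented stand-ins for HloProfilePrinter and its HloComputationInfo arrays:

    #include <memory>

    struct Info { int profile_index; };

    class FlatPrinter {
     public:
      using Deleter = void (*)(Info*, int);
      FlatPrinter(Info* infos, int n, Deleter d)
          : infos_(infos), n_(n), deleter_(d) {}
      ~FlatPrinter() {
        if (deleter_ != nullptr) deleter_(infos_, n_);  // frees the arrays
      }

     private:
      Info* infos_;
      int n_;
      Deleter deleter_;
    };

    std::unique_ptr<FlatPrinter> MakeFlatPrinter(int n) {
      Info* infos = new Info[n];  // flat C-style storage, as in the printer
      auto deleter = [](Info* p, int) { delete[] p; };
      return std::make_unique<FlatPrinter>(infos, n, deleter);
    }

    int main() {
      auto printer = MakeFlatPrinter(3);  // arrays die with the unique_ptr
      return 0;
    }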
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 84702680c0..470fd4ce3c 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -77,6 +77,11 @@ class HloProfileIndexMap {
std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
};
+// Creates an instance of `HloProfilePrinter` that owns its memory.
+std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+ const HloProfileIndexMap& hlo_profile_index_map,
+ const HloCostAnalysis& cost_analysis);
+
// Describes how much time each HLO operation took.
//
// Each HloComputation takes a certain number of cycles. This class helps break
@@ -85,8 +90,8 @@ class HloExecutionProfile {
public:
using DeviceDescription = perftools::gputools::DeviceDescription;
- HloExecutionProfile(const HloModule& module,
- const HloCostAnalysis& cost_analysis);
+ HloExecutionProfile(const HloProfilePrinter* hlo_profile_printer,
+ const HloProfileIndexMap* hlo_profile_index_map);
// Record how many cycles this HLO took to execute.
void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken);
@@ -114,15 +119,16 @@ class HloExecutionProfile {
// for the operations in a given computation. Returns an empty string if it
// wasn't possible to generate a printable version.
- string ToString(const DeviceDescription& device_description) const;
+ string ToString(const DeviceDescription& device_description) const {
+ return hlo_profile_printer_.ToString(profile_counters_.data(),
+ device_description.clock_rate_ghz());
+ }
- private:
- // hlo_profile_index_map_ maps an Hlo entity (computation or instruction) to
- // an index in profile_counters_.
- HloProfileIndexMap hlo_profile_index_map_;
+ std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
- // Used to print profile_counters_ in a human readable form.
- HloProfilePrinter hlo_profile_printer_;
+ private:
+ const HloProfilePrinter& hlo_profile_printer_;
+ const HloProfileIndexMap& hlo_profile_index_map_;
// Stores per-Hlo profile counters. This is the only thing that changes when
// we execute an XLA computation.
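
After this change HloExecutionProfile no longer owns its printer or index map; it borrows both from the Executable and owns only the counter vector, which is why mutable_profile_counters() can be handed straight to JITed code. A standalone model of the non-owning design, with strings standing in for HLO instructions:

    #include <cassert>
    #include <cstddef>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Map {  // stands in for HloProfileIndexMap
      std::unordered_map<std::string, std::size_t> idx;
      std::size_t total_count() const { return idx.size(); }
    };

    class Profile {  // stands in for HloExecutionProfile
     public:
      explicit Profile(const Map* map)
          : map_(*map), counters_(map_.total_count(), 0) {}
      void SetCyclesTakenBy(const std::string& hlo, long long cycles) {
        counters_[map_.idx.at(hlo)] = cycles;
      }
      long long GetCyclesTakenBy(const std::string& hlo) const {
        return counters_[map_.idx.at(hlo)];
      }
      std::vector<long long>* mutable_profile_counters() { return &counters_; }

     private:
      const Map& map_;                   // borrowed; owned by the Executable
      std::vector<long long> counters_;  // the only mutable state
    };

    int main() {
      Map map{{{"add", 0}, {"dot", 1}}};
      Profile profile(&map);
      profile.SetCyclesTakenBy("dot", 4000);
      assert(profile.GetCyclesTakenBy("dot") == 4000);
      return 0;
    }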
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index 5ba31296ea..b1e6729e2b 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -72,7 +72,11 @@ TEST_F(HloExecutionProfileTest, Basic) {
};
HloCostAnalysis cost_analysis(shape_size_function);
- HloExecutionProfile execution_profile(*hlo_module, cost_analysis);
+ HloProfileIndexMap profile_index_map(*hlo_module);
+ std::unique_ptr<HloProfilePrinter> profile_printer =
+ CreateHloProfilePrinter(profile_index_map, cost_analysis);
+ HloExecutionProfile execution_profile(profile_printer.get(),
+ &profile_index_map);
const int64 add_cycles = 1000;
const int64 dot_cycles = 4000;
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 96f937caf9..9183a1d1bf 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -42,7 +42,8 @@ namespace sep = ::perftools::gputools::interpreter;
InterpreterExecutable::InterpreterExecutable(
std::unique_ptr<const HloModule> hlo_module)
- : Executable(std::move(hlo_module)) {}
+ : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
+ /*hlo_profile_index_map=*/nullptr) {}
InterpreterExecutable::~InterpreterExecutable() {}
@@ -156,10 +157,5 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteAsyncOnStream(
return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
}
-std::unique_ptr<HloCostAnalysis> InterpreterExecutable::CreateCostAnalysis()
- const {
- return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
} // namespace interpreter
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index c69b0d036d..0e87eb90bf 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -61,8 +61,6 @@ class InterpreterExecutable : public Executable {
static int64 ShapeSizeBytes(const Shape& shape);
- std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
private:
TF_DISALLOW_COPY_AND_ASSIGN(InterpreterExecutable);
};
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 902a1afb45..d997cab83f 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -575,12 +575,13 @@ Service::ExecuteParallelAndRegisterResult(
// profile.
for (auto& index_to_profiled_stream : index_to_profiled_streams) {
int64 device = index_to_profiled_stream.first;
- auto& module = executables[device]->module();
se::Stream* stream = index_to_profiled_stream.second;
- HloExecutionProfile hlo_profile(module,
- *executables[device]->CreateCostAnalysis());
- TF_RETURN_IF_ERROR(executables[device]->PopulateExecutionProfile(
- &hlo_profile, stream->parent()));
+ Executable* executable = executables[device];
+ const HloModule& module = executable->module();
+ HloExecutionProfile hlo_profile(&executable->hlo_profile_printer(),
+ &executable->hlo_profile_index_map());
+ TF_RETURN_IF_ERROR(
+ executable->PopulateExecutionProfile(&hlo_profile, stream->parent()));
XLA_LOG_LINES(
tensorflow::INFO,
hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription()));
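
The service renders the collected counters against the device's clock rate via HloProfilePrinter::ToString(counters, clock_rate_ghz), as shown in the hlo_execution_profile.h hunk above. A standalone sketch of that rendering step; CounterPrinter and its cycles-to-microseconds formula are illustrative only:

    #include <cstdio>
    #include <string>
    #include <vector>

    struct CounterPrinter {  // simplified stand-in for HloProfilePrinter
      std::string ToString(const long long* counters, double clock_ghz) const {
        char buf[64];
        // cycles / (cycles per microsecond) => microseconds
        std::snprintf(buf, sizeof(buf), "total: %.1f us",
                      static_cast<double>(counters[0]) / (clock_ghz * 1e3));
        return std::string(buf);
      }
    };

    int main() {
      CounterPrinter printer;
      std::vector<long long> counters = {4000};  // cycles for whole program
      std::printf("%s\n", printer.ToString(counters.data(), 2.0).c_str());
      return 0;  // prints "total: 2.0 us" at an assumed 2 GHz clock
    }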