author    Sanjoy Das <sanjoy@google.com>  2017-11-22 12:39:54 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>  2017-11-22 12:50:33 -0800
commit    4b636957604faa3361a799dd9d8749a6b85afff7 (patch)
tree      e2b674a1f3f1a488c5487e4b4ad74483b8500397
parent    8af1600d49ff4cc16063ab1aafbde52be9347c62 (diff)
Place HloProfilePrinter and HloProfileIndexMap in Executable
This refactoring will later allow XlaCompiledCpuFunction to pull out the
HloProfilePrinter from Executable and use that to display the HLO execution
profile. A de/serialized HloProfilePrinter will let AOT compiled binaries
display their HLO execution profile.

PiperOrigin-RevId: 176689528
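For orientation, the ownership model this change introduces reduces to a short
standalone C++ sketch. The types below are simplified stand-ins, not the real
XLA classes; only the names and the invariant (printer and index map are either
both present or both absent) mirror the diff that follows.

#include <cassert>
#include <memory>

struct HloProfilePrinter {};   // stand-in
struct HloProfileIndexMap {};  // stand-in

// Sketch: the Executable base class now owns the profiling metadata, so
// every backend (CPU, GPU, interpreter) shares one implementation.
class Executable {
 public:
  Executable(std::unique_ptr<HloProfilePrinter> printer,
             std::unique_ptr<HloProfileIndexMap> index_map)
      : hlo_profile_printer_(std::move(printer)),
        hlo_profile_index_map_(std::move(index_map)) {
    // CHECK_EQ in the real code: both present or both absent.
    assert((hlo_profile_printer_ == nullptr) ==
           (hlo_profile_index_map_ == nullptr));
  }

  // Profiling support is now inferred from the presence of the printer
  // instead of being re-read from the module config.
  bool hlo_profiling_enabled() const {
    return hlo_profile_printer_ != nullptr;
  }

 private:
  std::unique_ptr<HloProfilePrinter> hlo_profile_printer_;
  std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
};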
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_compiler.cc            | 75
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_executable.cc          | 51
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_executable.h           | 19
-rw-r--r--  tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc | 46
-rw-r--r--  tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h  | 10
-rw-r--r--  tensorflow/compiler/xla/service/executable.h                   | 35
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_compiler.cc            | 18
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.cc          | 13
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.h           |  8
-rw-r--r--  tensorflow/compiler/xla/service/hlo_execution_profile.cc       | 22
-rw-r--r--  tensorflow/compiler/xla/service/hlo_execution_profile.h        | 24
-rw-r--r--  tensorflow/compiler/xla/service/hlo_execution_profile_test.cc  |  6
-rw-r--r--  tensorflow/compiler/xla/service/interpreter/executable.cc      |  8
-rw-r--r--  tensorflow/compiler/xla/service/interpreter/executable.h       |  2
-rw-r--r--  tensorflow/compiler/xla/service/service.cc                     | 11
15 files changed, 193 insertions(+), 155 deletions(-)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 56940b8d63..ff6042ae19 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -197,28 +197,35 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
class CollectProfileCandidates : public DfsHloVisitorWithDefault {
public:
static StatusOr<std::unordered_map<const HloInstruction*, size_t>>
- GetCandidatesForComputation(HloComputation* computation) {
+ GetCandidatesForComputation(
+ HloComputation* computation,
+ const std::unordered_map<const HloInstruction*, int64>&
+ assigned_indices) {
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
CollectProfileCandidates profile_candidates_for_computation(
- &hlo_to_profile_idx);
+ &hlo_to_profile_idx, assigned_indices);
TF_RETURN_IF_ERROR(
computation->Accept(&profile_candidates_for_computation));
return hlo_to_profile_idx;
}
private:
- explicit CollectProfileCandidates(
- std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx)
- : hlo_to_profile_idx_(hlo_to_profile_idx) {}
+ CollectProfileCandidates(
+ std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx,
+ const std::unordered_map<const HloInstruction*, int64>& assigned_indices)
+ : hlo_to_profile_idx_(hlo_to_profile_idx),
+ assigned_indices_(assigned_indices) {}
Status DefaultAction(HloInstruction* hlo_instruction) override {
- hlo_to_profile_idx_->insert({hlo_instruction, hlo_to_profile_idx_->size()});
+ hlo_to_profile_idx_->insert(
+ {hlo_instruction, FindOrDie(assigned_indices_, hlo_instruction)});
return Status::OK();
}
Status HandleCall(HloInstruction* call) override {
TF_RETURN_IF_ERROR(DefaultAction(call));
- CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_);
+ CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_,
+ assigned_indices_);
TF_RETURN_IF_ERROR(call->to_apply()->Accept(&candidates_for_call));
return Status::OK();
}
@@ -232,17 +239,20 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
Status HandleWhile(HloInstruction* xla_while) override {
TF_RETURN_IF_ERROR(DefaultAction(xla_while));
- CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_);
+ CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_,
+ assigned_indices_);
TF_RETURN_IF_ERROR(
xla_while->while_condition()->Accept(&candidates_for_condition));
- CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_);
+ CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_,
+ assigned_indices_);
TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&candidates_for_body));
return Status::OK();
}
std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx_;
+ const std::unordered_map<const HloInstruction*, int64>& assigned_indices_;
};
} // namespace
@@ -475,10 +485,27 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
HloComputation* computation = module->entry_computation();
std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
if (module->config().hlo_profiling_enabled()) {
+ hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
+
TF_ASSIGN_OR_RETURN(
hlo_to_profile_idx,
- CollectProfileCandidates::GetCandidatesForComputation(computation));
+ CollectProfileCandidates::GetCandidatesForComputation(
+ computation, hlo_profile_index_map->instruction_to_profile_idx()));
+
+ auto shape_size_bytes = [](const Shape& shape) {
+ // On the cpu, opaques are pointers.
+ if (ShapeUtil::IsOpaque(shape)) {
+ return static_cast<int64>(sizeof(void*));
+ }
+ return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+ };
+
+ HloCostAnalysis cost_analysis(shape_size_bytes);
+ hlo_profile_printer =
+ CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
}
std::unique_ptr<Executable> cpu_executable;
@@ -544,8 +571,16 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
parallel_computations.emplace(to_apply, instruction);
}
+ // We always profile the entire computation as a whole, even if hlo
+ // profiling is disabled. When hlo profiling is disabled, we pass in a
+ // profile counter array of just one element, which corresponds to the whole
+ // computation.
+ size_t entry_computation_profile_idx =
+ hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
+ *module->entry_computation())
+ : 0;
IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
- hlo_to_profile_idx, hlo_to_profile_idx.size(),
+ hlo_to_profile_idx, entry_computation_profile_idx,
jit->target_machine(), jit->external_constant_pool());
std::unique_ptr<HloInstructionMap<string>> function_names(
@@ -586,8 +621,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
jit->AddModule(std::move(llvm_module));
cpu_executable.reset(new ParallelCpuExecutable(
std::move(jit), std::move(assignment), std::move(module),
- std::move(function_names), std::move(hlo_to_profile_idx),
- std::move(aligned_constants)));
+ std::move(function_names), std::move(aligned_constants),
+ std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
if (embed_ir_in_executable) {
static_cast<CpuExecutable&>(*cpu_executable)
@@ -620,12 +655,22 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
proto, xla_dump_hlo_proto_to, module->name()));
}
+ // We always profile the entire computation as a whole, even if hlo
+ // profiling is disabled. When hlo profiling is disabled, we pass in a
+ // profile counter array of just one element, which corresponds to the whole
+ // computation.
+ size_t entry_computation_profile_idx =
+ hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
+ *module->entry_computation())
+ : 0;
+
// Each computation is a single function. Emit all embedded computations
// before the entry computation. The order of computations returned from
// GetEmbeddedComputations guarantees that a called computation occurs
// before a caller computation.
+
IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
- hlo_to_profile_idx, hlo_to_profile_idx.size(),
+ hlo_to_profile_idx, entry_computation_profile_idx,
jit->target_machine(), jit->external_constant_pool());
for (auto embedded_computation :
@@ -659,7 +704,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
jit->AddModule(std::move(llvm_module));
cpu_executable.reset(new CpuExecutable(
std::move(jit), std::move(assignment), std::move(module), function_name,
- std::move(hlo_to_profile_idx)));
+ std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
if (embed_ir_in_executable) {
static_cast<CpuExecutable&>(*cpu_executable)
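The two duplicated blocks above that compute entry_computation_profile_idx
follow one small pattern: when profiling is off there is exactly one counter,
at slot 0, for the whole computation. A sketch with simplified stand-in types
(the real HloProfileIndexMap carries more state):

#include <cstddef>
#include <unordered_map>

struct HloComputation {};  // stand-in
struct HloProfileIndexMap {
  std::unordered_map<const HloComputation*, std::size_t> computation_to_idx;
  std::size_t GetProfileIndexFor(const HloComputation& c) const {
    return computation_to_idx.at(&c);
  }
};

// With profiling disabled, index_map is null and the single
// whole-computation counter lives at slot 0.
std::size_t EntryComputationProfileIdx(const HloProfileIndexMap* index_map,
                                       const HloComputation& entry) {
  return index_map ? index_map->GetProfileIndexFor(entry) : 0;
}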
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index e6ef9d6314..e956f478b8 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -55,11 +55,12 @@ CpuExecutable::CpuExecutable(
std::unique_ptr<const BufferAssignment> assignment,
std::unique_ptr<const HloModule> hlo_module,
const string& entry_function_name,
- std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx)
- : Executable(std::move(hlo_module)),
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+ : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+ std::move(hlo_profile_index_map)),
jit_(std::move(jit)),
- assignment_(std::move(assignment)),
- hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) {
+ assignment_(std::move(assignment)) {
// Resolve symbols in the constructor rather than at execution time to avoid
// races because FindSymbol is not thread safe.
llvm::JITSymbol sym = jit_->FindSymbol(entry_function_name);
@@ -183,9 +184,16 @@ Status CpuExecutable::ExecuteComputeFunction(
uint64 start_micros = tensorflow::Env::Default()->NowMicros();
// Allocate profiling counters for each hlo instruction that we would like to
- // profile. Allocate an additional profile counter for the entire
- // computation.
- std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1);
+ // profile. Even when HLO profiling is disabled, we allocate a counter for
+ // the entire computation, which we use to update ExecutionProfile below.
+ std::vector<int64>* profile_counters = nullptr;
+ std::vector<int64> profile_counter_for_entry_computation;
+ if (hlo_execution_profile) {
+ profile_counters = hlo_execution_profile->mutable_profile_counters();
+ } else {
+ profile_counters = &profile_counter_for_entry_computation;
+ profile_counter_for_entry_computation.push_back(0);
+ }
// Call the computation function following the calling convention.
std::vector<void*> buffer_pointers;
@@ -200,7 +208,7 @@ Status CpuExecutable::ExecuteComputeFunction(
VLOG(3) << tensorflow::strings::Printf(
" func(void* result, void* params[%zu], void* temps[%zu], "
"uint64 profile_counters[%zu])",
- args_array.size(), buffer_pointers.size(), profile_counters.size());
+ args_array.size(), buffer_pointers.size(), profile_counters->size());
VLOG(3) << tensorflow::strings::Printf(" result = %p", result_buffer);
auto ptr_printer = [](string* out, const void* p) {
tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
@@ -212,11 +220,11 @@ Status CpuExecutable::ExecuteComputeFunction(
" temps = [%s]",
tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
VLOG(3) << tensorflow::strings::Printf(" profile_counters = %p",
- profile_counters.data());
+ profile_counters->data());
}
compute_function_(result_buffer, run_options, args_array.data(),
- buffer_pointers.data(), profile_counters.data());
+ buffer_pointers.data(), profile_counters->data());
uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -225,20 +233,15 @@ Status CpuExecutable::ExecuteComputeFunction(
const double nanoseconds = (end_micros - start_micros) * 1000.0;
execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
- // The last profile counter is used for the computation as a whole.
- execution_profile_.set_compute_cycle_count(profile_counters.back());
- }
-
- if (hlo_execution_profile != nullptr) {
- hlo_execution_profile->set_total_cycles_executed(
- *module().entry_computation(), profile_counters.back());
-
- for (auto hlo_prof_idx : hlo_to_profile_idx_) {
- const HloInstruction* hlo = hlo_prof_idx.first;
- uint64 cycles_taken = profile_counters[hlo_prof_idx.second];
- hlo_execution_profile->SetCyclesTakenBy(hlo, cycles_taken);
+ if (hlo_execution_profile) {
+ execution_profile_.set_compute_cycle_count(
+ hlo_execution_profile->total_cycles_executed(
+ *module().entry_computation()));
+ } else {
+ execution_profile_.set_compute_cycle_count(profile_counters->back());
}
}
+
return Status::OK();
}
@@ -428,9 +431,5 @@ const PointsToSet& CpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction());
}
-std::unique_ptr<HloCostAnalysis> CpuExecutable::CreateCostAnalysis() const {
- return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
} // namespace cpu
} // namespace xla
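The counter-selection logic added to ExecuteComputeFunction boils down to the
following pattern. This is an illustrative sketch only; the helper name and
parameter names are invented here, and the XLA plumbing around it is omitted.

#include <cstdint>
#include <vector>

// Returns the counter array the compiled function should write to: the
// execution profile's own counters when HLO profiling is on, otherwise a
// caller-provided one-element fallback covering the whole computation.
std::vector<std::int64_t>* SelectCounters(
    std::vector<std::int64_t>* profile_counters,  // null when profiling off
    std::vector<std::int64_t>* local_fallback) {
  if (profile_counters != nullptr) return profile_counters;
  local_fallback->assign(1, 0);  // one counter for the entire computation
  return local_fallback;
}

Either way the compiled function receives a non-null pointer, which is what
lets the same JIT calling convention serve both the profiled and unprofiled
paths.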
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 238bc9b46a..17ee2d673e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -47,12 +47,12 @@ namespace cpu {
// architecture, so JIT-ed code and host code share the same ABI.
class CpuExecutable : public Executable {
public:
- CpuExecutable(
- std::unique_ptr<SimpleOrcJIT> jit,
- std::unique_ptr<const BufferAssignment> assignment,
- std::unique_ptr<const HloModule> hlo_module,
- const string& entry_function_name,
- std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx);
+ CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit,
+ std::unique_ptr<const BufferAssignment> assignment,
+ std::unique_ptr<const HloModule> hlo_module,
+ const string& entry_function_name,
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
~CpuExecutable() override {}
StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -85,12 +85,10 @@ class CpuExecutable : public Executable {
static int64 ShapeSizeBytes(const Shape& shape);
- std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
// Type of the computation function we expect in the JIT.
using ComputeFunctionType = void (*)(
void* /*result*/, const ExecutableRunOptions* /*run_options*/,
- const void** /*args*/, void** /*temps*/, uint64* /*profile_counters*/);
+ const void** /*args*/, void** /*temps*/, int64* /*profile_counters*/);
const ComputeFunctionType& compute_function() const {
return compute_function_;
@@ -145,9 +143,6 @@ class CpuExecutable : public Executable {
// Entry function name for the computation.
const string entry_function_name_;
- // Maps HLOs to their index into the profile counter array.
- const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
-
TF_DISALLOW_COPY_AND_ASSIGN(CpuExecutable);
};
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index aff61296ce..0077e344e2 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -59,19 +59,20 @@ ParallelCpuExecutable::ParallelCpuExecutable(
std::unique_ptr<const BufferAssignment> assignment,
std::unique_ptr<const HloModule> hlo_module,
std::unique_ptr<const HloInstructionMap<string>> function_names,
- std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
- aligned_constants)
- : Executable(std::move(hlo_module)),
+ aligned_constants,
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+ : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+ std::move(hlo_profile_index_map)),
jit_(std::move(jit)),
assignment_(std::move(assignment)),
function_names_(std::move(function_names)),
- hlo_to_profile_idx_(std::move(hlo_to_profile_idx)),
aligned_constants_(std::move(aligned_constants)) {}
// Type of the computation function we expect in the JIT.
using ComputeFunctionType = void (*)(void*, const void*, const void**, void**,
- int64*, uint64*);
+ int64*, int64*);
// Given a pointer to an output buffer (following the CPU JIT calling
// conventions), mark addresses that are "live". The initial pointer itself is
@@ -106,7 +107,7 @@ class Executor {
const ServiceExecutableRunOptions* run_options,
std::list<HloInstruction*>* pending,
HloInstructionMap<const void*>* results, void** temps_array,
- uint64* profile_counters_array, const BufferAssignment* assignment)
+ int64* profile_counters_array, const BufferAssignment* assignment)
: functions_(functions),
run_options_(run_options),
pending_(pending),
@@ -147,7 +148,7 @@ class Executor {
std::list<HloInstruction*>* pending_;
HloInstructionMap<const void*>* results_;
void** temps_array_;
- uint64* profile_counters_array_;
+ int64* profile_counters_array_;
tensorflow::thread::ThreadPool* thread_pool_;
const BufferAssignment* assignment_;
@@ -389,9 +390,11 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
HloExecutionProfile* hlo_execution_profile) {
// Allocate profiling counters for each hlo instruction that we would like to
- // profile. Allocate an additional profile counter for the entire
- // computation.
- std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1);
+ // profile.
+ std::vector<int64>* profile_counters = nullptr;
+ if (hlo_execution_profile) {
+ profile_counters = hlo_execution_profile->mutable_profile_counters();
+ }
std::vector<void*> buffer_pointers;
buffer_pointers.reserve(buffers.size());
@@ -441,9 +444,9 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
// For example, if we expect a library conv/matmul call to run at max
// concurrency, we should not dispatch runnable instructions until the
// library call is finished (to avoid expensive cache invalidation).
- Executor executor(functions, run_options, &pending, &results,
- buffer_pointers.data(), profile_counters.data(),
- assignment_.get());
+ Executor executor(
+ functions, run_options, &pending, &results, buffer_pointers.data(),
+ profile_counters ? profile_counters->data() : nullptr, assignment_.get());
TF_RETURN_IF_ERROR(executor.Run());
@@ -453,18 +456,6 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
tensorflow::mutex_lock lock(mutex_);
double nanoseconds = (end_micros - start_micros) * 1000.0;
execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
- // The last profile counter is used for the computation as a whole.
- execution_profile_.set_compute_cycle_count(profile_counters.back());
- }
- if (hlo_execution_profile != nullptr) {
- hlo_execution_profile->set_total_cycles_executed(entry_computation,
- profile_counters.back());
-
- for (auto hlo_prof_idx : hlo_to_profile_idx_) {
- const HloInstruction* hlo = hlo_prof_idx.first;
- uint64 cycles_taken = profile_counters[hlo_prof_idx.second];
- hlo_execution_profile->SetCyclesTakenBy(hlo, cycles_taken);
- }
}
return Status::OK();
@@ -618,10 +609,5 @@ const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction());
}
-std::unique_ptr<HloCostAnalysis> ParallelCpuExecutable::CreateCostAnalysis()
- const {
- return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
} // namespace cpu
} // namespace xla
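Unlike the sequential CpuExecutable, the parallel path keeps no one-element
fallback: with profiling off the Executor simply receives a null counter
pointer and must tolerate it. A minimal sketch of the call-site expression
(helper name invented for illustration):

#include <cstdint>
#include <vector>

// Mirrors `profile_counters ? profile_counters->data() : nullptr` above.
std::int64_t* CountersOrNull(std::vector<std::int64_t>* profile_counters) {
  return profile_counters ? profile_counters->data() : nullptr;
}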
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index db16aaf48b..d65e3f42f3 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -52,10 +52,11 @@ class ParallelCpuExecutable : public Executable {
std::unique_ptr<const BufferAssignment> assignment,
std::unique_ptr<const HloModule> hlo_module,
std::unique_ptr<const HloInstructionMap<string>> function_names,
- std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
std::unordered_map<const HloInstruction*,
std::unique_ptr<unsigned char[]>>
- aligned_constants);
+ aligned_constants,
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
~ParallelCpuExecutable() override {}
StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -95,8 +96,6 @@ class ParallelCpuExecutable : public Executable {
"Equality test on CPU parallel executable is not implemented.");
}
- std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
private:
// Allocate buffers required for execution and assign them to the elements of
// "buffers". "buffers" should be sized to the number of buffers in buffer
@@ -143,9 +142,6 @@ class ParallelCpuExecutable : public Executable {
// Map containing the JITted function names for each HLO instruction.
const std::unique_ptr<const HloInstructionMap<string>> function_names_;
- // Maps HLOs to their index into the profile counter array.
- const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
-
// Map from HLO Constant instructions to a pointer to their literal data.
// The data stored in the protocol buffer might be insufficiently aligned,
// we create a sufficiently aligned copy and store it in this map.
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 2135707371..08862308c9 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -44,8 +44,15 @@ namespace xla {
// interface that is used for launching compiled programs across platforms.
class Executable {
public:
- explicit Executable(std::unique_ptr<const HloModule> hlo_module)
- : hlo_module_(std::move(hlo_module)) {}
+ explicit Executable(std::unique_ptr<const HloModule> hlo_module,
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+ : hlo_module_(std::move(hlo_module)),
+ hlo_profile_printer_(std::move(hlo_profile_printer)),
+ hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
+ CHECK_EQ(hlo_profile_printer_.get() == nullptr,
+ hlo_profile_index_map_.get() == nullptr);
+ }
virtual ~Executable() {}
// Enqueues the compilation result on the provided stream, passing the given
@@ -123,12 +130,20 @@ class Executable {
"Equality test on this executable is not implemented.");
}
+ const HloProfilePrinter& hlo_profile_printer() const {
+ CHECK(hlo_profiling_enabled());
+ return *hlo_profile_printer_;
+ }
+
+ const HloProfileIndexMap& hlo_profile_index_map() const {
+ CHECK(hlo_profiling_enabled());
+ return *hlo_profile_index_map_;
+ }
+
// Returns whether this executable was compiled with HLO profiling support
// enabled. If not, the caller should not expect an hlo_execution_profile
// passed to ExecuteOnStream above to be populated during execution.
- bool hlo_profiling_enabled() const {
- return hlo_module_->config().hlo_profiling_enabled();
- }
+ bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
const HloModule& module() const { return *hlo_module_; }
@@ -160,10 +175,6 @@ class Executable {
static Status DumpToDirectory(const string& directory_path, string filename,
const SessionModule& session_module);
- // Returns a cost analysis object appropriate for the platform on which this
- // executable can run.
- virtual std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const = 0;
-
protected:
mutable tensorflow::mutex mutex_;
@@ -181,6 +192,9 @@ class Executable {
// Execution count, used to generate a unique filename for each dumped
// execution.
int64 execution_count_ = 0;
+
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer_;
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
};
template <typename ReturnT, typename ArgT>
@@ -200,7 +214,8 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
std::unique_ptr<HloExecutionProfile> profile_ptr =
module_config().debug_options().xla_hlo_profile() &&
hlo_profiling_enabled()
- ? MakeUnique<HloExecutionProfile>(module(), *CreateCostAnalysis())
+ ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer(),
+ &hlo_profile_index_map())
: nullptr;
auto return_value =
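The profile-creation decision in ExecuteOnStreamWrapper now requires both the
runtime debug flag and compiled-in profiling support. A hedged sketch with a
stand-in profile type and an invented helper name:

#include <memory>

struct HloExecutionProfile {};  // stand-in

std::unique_ptr<HloExecutionProfile> MaybeMakeProfile(
    bool xla_hlo_profile_flag, bool compiled_with_profiling) {
  // Both conditions must hold; otherwise callers get nullptr and no
  // per-instruction counters are collected.
  return (xla_hlo_profile_flag && compiled_with_profiling)
             ? std::make_unique<HloExecutionProfile>()
             : nullptr;
}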
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 92c53265d0..fcd73fd37a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -465,10 +465,20 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
VLOG(2) << "Printing the thunk schedule...";
XLA_VLOG_LINES(2, thunk_schedule->ToString());
- auto* gpu_executable =
- new GpuExecutable(ptx, cubin, {cc_major, cc_minor},
- std::move(thunk_schedule), std::move(module),
- std::move(buffer_assignment), ShapeSizeBytesFunction());
+ std::unique_ptr<HloProfileIndexMap> profile_index_map;
+ std::unique_ptr<HloProfilePrinter> profile_printer;
+
+ if (module->config().hlo_profiling_enabled()) {
+ HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
+ profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
+ profile_printer =
+ CreateHloProfilePrinter(*profile_index_map, cost_analysis);
+ }
+
+ auto* gpu_executable = new GpuExecutable(
+ ptx, cubin, {cc_major, cc_minor}, std::move(thunk_schedule),
+ std::move(module), std::move(buffer_assignment),
+ std::move(profile_printer), std::move(profile_index_map));
if (embed_ir_in_executable) {
DCHECK_NE("", ir_module_string_before_opt);
gpu_executable->set_ir_module_string(ir_module_string_before_opt);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index c6f23f9b05..0fd85e4fb0 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -113,14 +113,15 @@ GpuExecutable::GpuExecutable(
std::unique_ptr<const ThunkSchedule> thunk_schedule,
std::unique_ptr<const HloModule> hlo_module,
std::unique_ptr<const BufferAssignment> assignment,
- HloCostAnalysis::ShapeSizeFunction shape_size_function)
- : Executable(std::move(hlo_module)),
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+ : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+ std::move(hlo_profile_index_map)),
ptx_(ptx),
cubin_(cubin),
compute_capability_(compute_capability),
thunk_schedule_(std::move(thunk_schedule)),
- assignment_(std::move(assignment)),
- shape_size_function_(std::move(shape_size_function)) {}
+ assignment_(std::move(assignment)) {}
Status GpuExecutable::ExecuteThunks(
const ServiceExecutableRunOptions* run_options,
@@ -358,9 +359,5 @@ const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
module().entry_computation()->root_instruction());
}
-std::unique_ptr<HloCostAnalysis> GpuExecutable::CreateCostAnalysis() const {
- return MakeUnique<HloCostAnalysis>(shape_size_function_);
-}
-
} // namespace gpu
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index a3815370c1..e7307e07c0 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -54,7 +54,8 @@ class GpuExecutable : public Executable {
std::unique_ptr<const ThunkSchedule> thunk_schedule,
std::unique_ptr<const HloModule> hlo_module,
std::unique_ptr<const BufferAssignment> assignment,
- HloCostAnalysis::ShapeSizeFunction shape_size_function);
+ std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+ std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
// This should be called after set_ir_module_string.
const string& ir_module_string() const { return ir_module_string_; }
@@ -95,8 +96,6 @@ class GpuExecutable : public Executable {
return Unimplemented("Equality test on GPU executable is not implemented.");
}
- std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
private:
// If `block_host_until_done` is false, execution will not block the host
// until the kernels have completed. This is used as an optimization for
@@ -140,9 +139,6 @@ class GpuExecutable : public Executable {
// memory for every output/temp buffers.
const std::unique_ptr<const BufferAssignment> assignment_;
- // Function to compute the size of a given Shape, in bytes.
- const HloCostAnalysis::ShapeSizeFunction shape_size_function_;
-
TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
};
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index 9e256b9b37..ba75e2ef1b 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -40,7 +40,7 @@ HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
}
}
-static HloProfilePrinter CreateOwnedHloProfilePrinter(
+std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
const HloProfileIndexMap& hlo_profile_index_map,
const HloCostAnalysis& cost_analysis) {
using HloComputationInfo = HloProfilePrinter::HloComputationInfo;
@@ -108,15 +108,15 @@ static HloProfilePrinter CreateOwnedHloProfilePrinter(
delete[] computation_infos;
};
- return HloProfilePrinter(computation_infos,
- hlo_profile_index_map.computation_count(), deleter);
+ return MakeUnique<HloProfilePrinter>(
+ computation_infos, hlo_profile_index_map.computation_count(), deleter);
}
-HloExecutionProfile::HloExecutionProfile(const HloModule& module,
- const HloCostAnalysis& cost_analysis)
- : hlo_profile_index_map_(module),
- hlo_profile_printer_(
- CreateOwnedHloProfilePrinter(hlo_profile_index_map_, cost_analysis)),
+HloExecutionProfile::HloExecutionProfile(
+ const HloProfilePrinter* hlo_profile_printer,
+ const HloProfileIndexMap* hlo_profile_index_map)
+ : hlo_profile_printer_(*hlo_profile_printer),
+ hlo_profile_index_map_(*hlo_profile_index_map),
profile_counters_(
/*count*/ hlo_profile_index_map_.total_count(),
/*value*/ 0) {}
@@ -131,10 +131,4 @@ uint64 HloExecutionProfile::GetCyclesTakenBy(const HloInstruction& hlo) const {
return profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(hlo)];
}
-string HloExecutionProfile::ToString(
- const DeviceDescription& device_description) const {
- return hlo_profile_printer_.ToString(profile_counters_.data(),
- device_description.clock_rate_ghz());
-}
-
} // namespace xla
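After this change HloExecutionProfile only borrows the printer and index map,
which the owning Executable keeps alive; the profile itself owns nothing but
the mutable counter array. A simplified standalone sketch (stand-in types, not
the real declarations):

#include <cstddef>
#include <cstdint>
#include <vector>

struct HloProfilePrinter {};  // stand-in
struct HloProfileIndexMap {
  std::size_t total_count() const { return 4; }  // arbitrary for the sketch
};

class HloExecutionProfile {
 public:
  HloExecutionProfile(const HloProfilePrinter* printer,
                      const HloProfileIndexMap* index_map)
      : printer_(*printer),
        index_map_(*index_map),
        profile_counters_(index_map_.total_count(), 0) {}

  const HloProfilePrinter& printer() const { return printer_; }

  // Exposed so executables can let JIT-ed code write counters in place.
  std::vector<std::int64_t>* mutable_profile_counters() {
    return &profile_counters_;
  }

 private:
  const HloProfilePrinter& printer_;     // borrowed, not owned
  const HloProfileIndexMap& index_map_;  // borrowed, not owned
  std::vector<std::int64_t> profile_counters_;
};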
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 84702680c0..470fd4ce3c 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -77,6 +77,11 @@ class HloProfileIndexMap {
std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
};
+// Create an instance of `HloProfilePrinter` that owns its memory.
+std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+ const HloProfileIndexMap& hlo_profile_index_map,
+ const HloCostAnalysis& cost_analysis);
+
// Describes how much time each HLO operation took.
//
// Each HloComputation takes a certain number of cycles. This class helps break
@@ -85,8 +90,8 @@ class HloExecutionProfile {
public:
using DeviceDescription = perftools::gputools::DeviceDescription;
- HloExecutionProfile(const HloModule& module,
- const HloCostAnalysis& cost_analysis);
+ HloExecutionProfile(const HloProfilePrinter* hlo_profile_printer,
+ const HloProfileIndexMap* hlo_profile_index_map);
// Record how many cycles this HLO took to execute.
void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken);
@@ -114,15 +119,16 @@ class HloExecutionProfile {
// for the operations in a given computation. Returns an empty string if it
// wasn't possible to generate a printable version. cost_analysis should be a
// clean analysis that can be used to visit the computation.
- string ToString(const DeviceDescription& device_description) const;
+ string ToString(const DeviceDescription& device_description) const {
+ return hlo_profile_printer_.ToString(profile_counters_.data(),
+ device_description.clock_rate_ghz());
+ }
- private:
- // hlo_profile_index_map_ maps an Hlo entity (computation or instruction) to
- // an index in profile_counters_.
- HloProfileIndexMap hlo_profile_index_map_;
+ std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
- // Used to print profile_counters_ in a human readable form.
- HloProfilePrinter hlo_profile_printer_;
+ private:
+ const HloProfilePrinter& hlo_profile_printer_;
+ const HloProfileIndexMap& hlo_profile_index_map_;
// Stores per-Hlo profile counters. This is the only thing that changes when
// we execute an XLA computation.
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index 5ba31296ea..b1e6729e2b 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -72,7 +72,11 @@ TEST_F(HloExecutionProfileTest, Basic) {
};
HloCostAnalysis cost_analysis(shape_size_function);
- HloExecutionProfile execution_profile(*hlo_module, cost_analysis);
+ HloProfileIndexMap profile_index_map(*hlo_module);
+ std::unique_ptr<HloProfilePrinter> profile_printer =
+ CreateHloProfilePrinter(profile_index_map, cost_analysis);
+ HloExecutionProfile execution_profile(profile_printer.get(),
+ &profile_index_map);
const int64 add_cycles = 1000;
const int64 dot_cycles = 4000;
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 96f937caf9..9183a1d1bf 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -42,7 +42,8 @@ namespace sep = ::perftools::gputools::interpreter;
InterpreterExecutable::InterpreterExecutable(
std::unique_ptr<const HloModule> hlo_module)
- : Executable(std::move(hlo_module)) {}
+ : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
+ /*hlo_profile_index_map=*/nullptr) {}
InterpreterExecutable::~InterpreterExecutable() {}
@@ -156,10 +157,5 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteAsyncOnStream(
return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
}
-std::unique_ptr<HloCostAnalysis> InterpreterExecutable::CreateCostAnalysis()
- const {
- return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
} // namespace interpreter
} // namespace xla
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index c69b0d036d..0e87eb90bf 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -61,8 +61,6 @@ class InterpreterExecutable : public Executable {
static int64 ShapeSizeBytes(const Shape& shape);
- std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
private:
TF_DISALLOW_COPY_AND_ASSIGN(InterpreterExecutable);
};
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 902a1afb45..d997cab83f 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -575,12 +575,13 @@ Service::ExecuteParallelAndRegisterResult(
// profile.
for (auto& index_to_profiled_stream : index_to_profiled_streams) {
int64 device = index_to_profiled_stream.first;
- auto& module = executables[device]->module();
se::Stream* stream = index_to_profiled_stream.second;
- HloExecutionProfile hlo_profile(module,
- *executables[device]->CreateCostAnalysis());
- TF_RETURN_IF_ERROR(executables[device]->PopulateExecutionProfile(
- &hlo_profile, stream->parent()));
+ Executable* executable = executables[device];
+ const HloModule& module = executable->module();
+ HloExecutionProfile hlo_profile(&executable->hlo_profile_printer(),
+ &executable->hlo_profile_index_map());
+ TF_RETURN_IF_ERROR(
+ executable->PopulateExecutionProfile(&hlo_profile, stream->parent()));
XLA_LOG_LINES(
tensorflow::INFO,
hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription()));