author    A. Unique TensorFlower <gardener@tensorflow.org>    2017-11-06 02:57:40 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>    2017-11-06 03:02:01 -0800
commit    c217140933e36f4f2ac18a2add51e27b1d146b24 (patch)
tree      83c92293b91785f2e23c823d7c7cb93c65121b75
parent    7ec9c634f6b41ee081be57ed14bdefc838691dbf (diff)
Add profiling support to Service::ExecuteParallel.
PiperOrigin-RevId: 174682772
-rw-r--r--    tensorflow/compiler/xla/service/executable.h    10
-rw-r--r--    tensorflow/compiler/xla/service/service.cc      94
-rw-r--r--    tensorflow/compiler/xla/service/service.h        3
3 files changed, 104 insertions(+), 3 deletions(-)
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 2d32e59d36..7e0d182b36 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -88,6 +88,16 @@ class Executable {
tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
arguments);
+ // Populates `hlo_execution_profile` from `executor`. This is implicit in any
+ // Execute* API call that takes an hlo_execution_profile argument, but must be
+ // called explicitly for other (async, for example) variants after the stream
+ // has completed.
+ virtual Status PopulateExecutionProfile(
+ HloExecutionProfile* hlo_execution_profile,
+ perftools::gputools::StreamExecutor* executor) {
+ return Status::OK();
+ }
+
// Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
// timer for the execution, sets up HLO profiling if enabled, and fills in the
// given ExecutionProfile if non-null. The ExecuteOnStream overloads have
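
The new hook above is intended for execution paths that do not thread an hlo_execution_profile argument through the call, such as the async variants. A minimal sketch of such a caller (hypothetical local variables; the executable, stream, run options, and arguments are assumed to be set up as in Service::ExecuteParallelAndRegisterResult below):

    // Sketch (illustrative only): populate the profile explicitly after an
    // async launch, once the stream has finished.
    HloExecutionProfile hlo_profile;
    TF_ASSIGN_OR_RETURN(
        perftools::gputools::DeviceMemoryBase result,
        executable->ExecuteAsyncOnStream(&run_options, arguments));
    stream->BlockHostUntilDone();  // Profile data is only valid after the stream drains.
    TF_RETURN_IF_ERROR(
        executable->PopulateExecutionProfile(&hlo_profile, stream->parent()));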
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index bac33d8102..b6c9c5eefa 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -490,14 +490,20 @@ Service::ExecuteParallelAndRegisterResult(
std::vector<perftools::gputools::DeviceMemoryBase>>
arguments,
Backend* backend, tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
- tensorflow::gtl::ArraySlice<string> result_tags) {
+ tensorflow::gtl::ArraySlice<string> result_tags,
+ ExecutionProfile* profile) {
// Streams where the computations are launched, so we can wait on the streams
// to complete.
std::vector<Pool<se::Stream>::SmartPtr> streams;
+ std::vector<std::unique_ptr<perftools::gputools::Timer>> timers;
// Global data handles for the computation results, one for each computation.
std::vector<GlobalDataHandle> result_handles;
+ // Device index to its stream, populated only for devices that are being
+ // profiled.
+ std::map<int64, se::Stream*> index_to_profiled_streams;
+
TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
backend->computation_placer()->AssignDevices(
options_.number_of_replicas(), executables.size()));
@@ -510,6 +516,21 @@ Service::ExecuteParallelAndRegisterResult(
backend->BorrowStream(replicas[replica]));
streams.push_back(std::move(stream));
+ if (replica == 0 && profile != nullptr) {
+ timers.emplace_back(
+ new perftools::gputools::Timer(streams.back()->parent()));
+ streams.back()
+ ->InitTimer(timers.back().get())
+ .ThenStartTimer(timers.back().get());
+ CHECK(timers.front() != nullptr);
+ }
+
+ if (replica == 0 &&
+ executables[i]->module_config().debug_options().xla_hlo_profile() &&
+ executables[i]->hlo_profiling_enabled()) {
+ index_to_profiled_streams[i] = streams.back().get();
+ }
+
// Set up run options.
ExecutableRunOptions options;
options.set_stream(streams.back().get());
@@ -526,6 +547,10 @@ Service::ExecuteParallelAndRegisterResult(
perftools::gputools::DeviceMemoryBase result,
executables[i]->ExecuteAsyncOnStream(&run_options, arguments[i]));
+ if (replica == 0 && profile != nullptr) {
+ streams.back()->ThenStopTimer(timers.back().get());
+ }
+
// All replicas share the same device address for the result allocation,
// so only one of the replicas needs to register the result handle.
if (replica == 0) {
@@ -543,6 +568,69 @@ Service::ExecuteParallelAndRegisterResult(
}
}
+ // For every stream that had profiling enabled, obtain and debug-dump the HLO
+ // profile.
+ for (auto& index_to_profiled_stream : index_to_profiled_streams) {
+ int64 device = index_to_profiled_stream.first;
+ se::Stream* stream = index_to_profiled_stream.second;
+ HloExecutionProfile hlo_profile;
+ TF_RETURN_IF_ERROR(executables[device]->PopulateExecutionProfile(
+ &hlo_profile, stream->parent()));
+
+ std::unordered_set<const xla::HloComputation*> profiled_computations =
+ hlo_profile.profiled_computations();
+ // To ensure we print the profiles in a stable order, iterate over the
+ // computations in post order.
+ auto& module = executables[device]->module();
+ std::list<xla::HloComputation*> all_computations =
+ module.MakeComputationPostOrder();
+ for (xla::HloComputation* computation : all_computations) {
+ if (profiled_computations.count(computation) > 0) {
+ string profile_string = hlo_profile.ToString(
+ *computation, streams[0]->parent()->GetDeviceDescription(),
+ executables[device]->CreateCostAnalysis().get());
+ if (!profile_string.empty()) {
+ LOG(INFO) << "HLO profile for execution on device " << device
+ << ":\n";
+ XLA_LOG_LINES(tensorflow::INFO, profile_string);
+ }
+ }
+ }
+ hlo_graph_dumper::MaybeDumpHloModule(module, "Service::Execute",
+ &hlo_profile);
+ }
+
+ if (profile != nullptr) {
+ CHECK(!timers.empty());
+ std::vector<uint64> timer_nanoseconds;
+ timer_nanoseconds.reserve(timers.size());
+ for (auto& timer : timers) {
+ timer_nanoseconds.push_back(timer->Nanoseconds());
+ }
+ uint64 nanoseconds =
+ *std::max_element(timer_nanoseconds.begin(), timer_nanoseconds.end());
+
+ // Merge in run-time profile information from execution_profile on the
+ // zeroth device.
+ profile->MergeFrom(executables[0]->execution_profile());
+
+ // Overall execution time (in nanoseconds) from the executor timer.
+ profile->set_compute_and_transfer_time_ns(nanoseconds);
+
+ // TODO(b/28123297): On GPU we end up including transfer time in
+ // the compute time this way. Instead, we should get the correct
+ // value by measuring it. Setting the field here at least lets
+ // benchmarks provide *some* value for GPU computations.
+ //
+ // TODO(b/28447609): The value in compute_and_transfer_time_ns is actually
+ // the compute time without the transfer time, so this way we get the
+ // correct compute time. We should instead have the correct value for
+ // compute_and_transfer_time and set compute_time to the compute time.
+ if (profile->compute_time_ns() == 0) {
+ profile->set_compute_time_ns(profile->compute_and_transfer_time_ns());
+ }
+ }
+
return result_handles;
}
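
Read end to end, the timer manipulations interleaved with result registration above reduce to the following per-device pattern (an illustrative sketch with hypothetical local names, not code from this change):

    // Sketch: per-device timing pattern used for replica 0 of each computation.
    std::unique_ptr<perftools::gputools::Timer> timer(
        new perftools::gputools::Timer(stream->parent()));
    stream->InitTimer(timer.get()).ThenStartTimer(timer.get());  // enqueue start
    // ... ExecuteAsyncOnStream(&run_options, arguments) ...
    stream->ThenStopTimer(timer.get());                          // enqueue stop
    // Only after the stream has drained is the device-side elapsed time valid:
    uint64 elapsed_ns = timer->Nanoseconds();

The overall compute_and_transfer_time_ns is then the maximum of these per-device times, since the slowest device bounds the wall time of the parallel execution.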
@@ -715,14 +803,16 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
// Execute the generated executables in parallel and return the device
// handles for each computation's output.
+ ExecutionProfile profile;
TF_ASSIGN_OR_RETURN(
std::vector<GlobalDataHandle> outputs,
ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
execute_backend_.get(), device_handles,
- computation_names));
+ computation_names, &profile));
for (const GlobalDataHandle& output : outputs) {
ExecuteResponse response;
*response.mutable_output() = output;
+ *response.mutable_profile() = profile;
*result->add_responses() = response;
}
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 2452259f73..6646be2e9a 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -327,7 +327,8 @@ class Service : public ServiceInterface {
arguments,
Backend* backend,
tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
- tensorflow::gtl::ArraySlice<string> result_tags);
+ tensorflow::gtl::ArraySlice<string> result_tags,
+ ExecutionProfile* profile);
// Convenience function for adding a function to a user computation.
template <typename RequestT, typename ResponseT>