commit     c217140933e36f4f2ac18a2add51e27b1d146b24
tree       83c92293b91785f2e23c823d7c7cb93c65121b75
parent     7ec9c634f6b41ee081be57ed14bdefc838691dbf
author     2017-11-06 02:57:40 -0800
committer  2017-11-06 03:02:01 -0800
Add profiling support to Service::ExecuteParallel.
PiperOrigin-RevId: 174682772
-rw-r--r--  tensorflow/compiler/xla/service/executable.h | 10
-rw-r--r--  tensorflow/compiler/xla/service/service.cc   | 94
-rw-r--r--  tensorflow/compiler/xla/service/service.h    |  3

3 files changed, 104 insertions, 3 deletions
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 2d32e59d36..7e0d182b36 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -88,6 +88,16 @@ class Executable {
           tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
           arguments);
 
+  // Populates `hlo_execution_profile` from `executor`. This is implicit in any
+  // Execute* API call that takes a hlo_execution_profile argument, but must be
+  // called explicitly for other (async, for example) variants after the stream
+  // has completed.
+  virtual Status PopulateExecutionProfile(
+      HloExecutionProfile* hlo_execution_profile,
+      perftools::gputools::StreamExecutor* executor) {
+    return Status::OK();
+  }
+
   // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
   // timer for the execution, sets up HLO profiling if enabled, and fills in the
   // given ExecutionProfile if non-null. The ExecuteOnStream overloads have
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index bac33d8102..b6c9c5eefa 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -490,14 +490,20 @@ Service::ExecuteParallelAndRegisterResult(
         std::vector<perftools::gputools::DeviceMemoryBase>>
         arguments,
     Backend* backend, tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
-    tensorflow::gtl::ArraySlice<string> result_tags) {
+    tensorflow::gtl::ArraySlice<string> result_tags,
+    ExecutionProfile* profile) {
   // Streams where the computations are launched, so we can wait on the streams
   // to complete.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
+  std::vector<std::unique_ptr<perftools::gputools::Timer>> timers;
 
   // Global data handles for the computation results, one for each computation.
   std::vector<GlobalDataHandle> result_handles;
 
+  // Device ID to stream executor, populated only with devices that are being
+  // profiled.
+  std::map<int64, se::Stream*> index_to_profiled_streams;
+
   TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
                       backend->computation_placer()->AssignDevices(
                           options_.number_of_replicas(), executables.size()));
@@ -510,6 +516,21 @@ Service::ExecuteParallelAndRegisterResult(
                           backend->BorrowStream(replicas[replica]));
       streams.push_back(std::move(stream));
 
+      if (replica == 0 && profile != nullptr) {
+        timers.emplace_back(
+            new perftools::gputools::Timer(streams.back()->parent()));
+        streams.back()
+            ->InitTimer(timers.back().get())
+            .ThenStartTimer(timers.back().get());
+        CHECK(timers.front() != nullptr);
+      }
+
+      if (replica == 0 &&
+          executables[i]->module_config().debug_options().xla_hlo_profile() &&
+          executables[i]->hlo_profiling_enabled()) {
+        index_to_profiled_streams[i] = streams.back().get();
+      }
+
       // Set up run options.
       ExecutableRunOptions options;
       options.set_stream(streams.back().get());
@@ -526,6 +547,10 @@ Service::ExecuteParallelAndRegisterResult(
           perftools::gputools::DeviceMemoryBase result,
           executables[i]->ExecuteAsyncOnStream(&run_options, arguments[i]));
 
+      if (replica == 0 && profile != nullptr) {
+        streams.back()->ThenStopTimer(timers.back().get());
+      }
+
       // All replicas share the same device address for the result allocation,
       // so only one of the replicas needs to register the result handle.
       if (replica == 0) {
@@ -543,6 +568,69 @@ Service::ExecuteParallelAndRegisterResult(
     }
   }
 
+  // For every stream that had profiling enabled, obtain and debug-dump the HLO
+  // profile.
+  for (auto& index_to_profiled_stream : index_to_profiled_streams) {
+    int64 device = index_to_profiled_stream.first;
+    se::Stream* stream = index_to_profiled_stream.second;
+    HloExecutionProfile hlo_profile;
+    TF_RETURN_IF_ERROR(executables[device]->PopulateExecutionProfile(
+        &hlo_profile, stream->parent()));
+
+    std::unordered_set<const xla::HloComputation*> profiled_computations =
+        hlo_profile.profiled_computations();
+    // To ensure we print the profiles in a stable order, iterate over the
+    // computations in post order.
+    auto& module = executables[device]->module();
+    std::list<xla::HloComputation*> all_computations =
+        module.MakeComputationPostOrder();
+    for (xla::HloComputation* computation : all_computations) {
+      if (profiled_computations.count(computation) > 0) {
+        string profile_string = hlo_profile.ToString(
+            *computation, streams[0]->parent()->GetDeviceDescription(),
+            executables[device]->CreateCostAnalysis().get());
+        if (!profile_string.empty()) {
+          LOG(INFO) << "HLO profile for execution on device " << device
+                    << ":\n";
+          XLA_LOG_LINES(tensorflow::INFO, profile_string);
+        }
+      }
+    }
+    hlo_graph_dumper::MaybeDumpHloModule(module, "Service::Execute",
+                                         &hlo_profile);
+  }
+
+  if (profile != nullptr) {
+    CHECK(!timers.empty());
+    std::vector<uint64> timer_nanoseconds;
+    timer_nanoseconds.reserve(timers.size());
+    for (auto& timer : timers) {
+      timer_nanoseconds.push_back(timer->Nanoseconds());
+    }
+    uint64 nanoseconds =
+        *std::max_element(timer_nanoseconds.begin(), timer_nanoseconds.end());
+
+    // Merge in run-time profile information from execution_profile on the
+    // zeroth device.
+    profile->MergeFrom(executables[0]->execution_profile());
+
+    // Overall execution time (in nanoseconds) from the executor timer.
+    profile->set_compute_and_transfer_time_ns(nanoseconds);
+
+    // TODO(b/28123297): On GPU we end up including transfer time in
+    // the compute time this way. Instead, we should get the correct
+    // value by measuring it. Setting the field here at least lets
+    // benchmarks provide *some* value for GPU computations.
+    //
+    // TODO(b/28447609): The value in compute_and_transfer_time_ns is actually
+    // the compute time without the transfer time, so this way we get the
+    // correct compute time. We should instead have the correct value for
+    // compute_and_transfer_time and set compute_time to the compute time.
+    if (profile->compute_time_ns() == 0) {
+      profile->set_compute_time_ns(profile->compute_and_transfer_time_ns());
+    }
+  }
+
   return result_handles;
 }
 
@@ -715,14 +803,16 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
 
   // Execute the generated executables in parallel and return the device
   // handles for each computation's output.
+  ExecutionProfile profile;
   TF_ASSIGN_OR_RETURN(
       std::vector<GlobalDataHandle> outputs,
       ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
                                        execute_backend_.get(), device_handles,
-                                       computation_names));
+                                       computation_names, &profile));
 
   for (const GlobalDataHandle& output : outputs) {
     ExecuteResponse response;
     *response.mutable_output() = output;
+    *response.mutable_profile() = profile;
     *result->add_responses() = response;
   }
 
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 2452259f73..6646be2e9a 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -327,7 +327,8 @@ class Service : public ServiceInterface {
           arguments,
       Backend* backend,
       tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
-      tensorflow::gtl::ArraySlice<string> result_tags);
+      tensorflow::gtl::ArraySlice<string> result_tags,
+      ExecutionProfile* profile);
 
   // Convenience function for adding a function to a user computation.
   template <typename RequestT, typename ResponseT>
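
The pieces of this change fit together as follows: on replica 0 of each device, a StreamExecutor timer brackets the asynchronous launch, and once the streams have drained, the new Executable::PopulateExecutionProfile hook fills in the per-HLO profile that the synchronous Execute* paths populate implicitly. Below is a minimal sketch of that calling pattern, not code from this commit: it assumes an already-compiled `executable`, a borrowed `stream`, and valid `run_options` and `arguments` in scope, all inside a function returning tensorflow::Status.

    namespace se = perftools::gputools;

    // Sketch only; `executable`, `stream`, `run_options`, and `arguments`
    // are assumed to exist and are not introduced by this commit.
    se::Timer timer(stream->parent());
    stream->InitTimer(&timer).ThenStartTimer(&timer);

    TF_ASSIGN_OR_RETURN(
        se::DeviceMemoryBase result,
        executable->ExecuteAsyncOnStream(&run_options, arguments));

    stream->ThenStopTimer(&timer);

    // Timer readings and profiling counters are only meaningful once the
    // stream has completed.
    stream->BlockHostUntilDone();

    // Explicitly populate the HLO profile; the synchronous Execute*
    // overloads that take an hlo_execution_profile argument do this
    // step implicitly.
    HloExecutionProfile hlo_profile;
    TF_RETURN_IF_ERROR(
        executable->PopulateExecutionProfile(&hlo_profile, stream->parent()));

In ExecuteParallelAndRegisterResult itself, one such timer is kept per device (replica 0 only), and the reported compute_and_transfer_time_ns is the maximum of the timers' Nanoseconds() readings, merged into the ExecutionProfile that ExecuteParallel now copies into every ExecuteResponse. A client iterating the responses can then read it back, roughly like this (again a sketch, assuming a populated ExecuteParallelResponse* named `result`):

    for (const ExecuteResponse& response : result->responses()) {
      LOG(INFO) << "compute_and_transfer_time_ns = "
                << response.profile().compute_and_transfer_time_ns();
    }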