commit 9eb310c3f651d69b9a8eea016f6397049c5a0a31 (patch)
tree   fce34ab87d8304f024dad4f1a77b9aa0448da28f
parent 04af1735f203df9fdc539f7c8d6cd1cd0425d25b (diff)

author    Tim Shen <timshen@google.com>                     2018-08-09 15:46:02 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>   2018-08-09 15:49:54 -0700
Make CpuExecutable::ExecuteOnStream actually use the stream.
Currently, it eagerly executes the function, ignoring previously enqueued tasks.
Also removed the "Asynchronous execution on stream with hlo profiling is not yet supported on CPU." check, as profiling is not supported by the async interface anyway.
PiperOrigin-RevId: 208124125
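To make the new control flow concrete, here is a minimal, self-contained C++ sketch of the pattern this change adopts: the synchronous entry point funnels through the asynchronous implementation and then blocks until the stream drains. All types here (HostStream, Result, Executable) are simplified stand-ins for illustration, not the real XLA/StreamExecutor classes.

#include <functional>
#include <memory>
#include <queue>
#include <utility>

// Hypothetical stand-in for se::host::HostStream: tasks run in FIFO order.
class HostStream {
 public:
  void EnqueueTask(std::function<void()> task) { tasks_.push(std::move(task)); }

  // Stand-in for Stream::BlockHostUntilDone(): drain the queue in order.
  void BlockHostUntilDone() {
    while (!tasks_.empty()) {
      auto task = std::move(tasks_.front());
      tasks_.pop();
      task();
    }
  }

 private:
  std::queue<std::function<void()>> tasks_;
};

// Hypothetical result handle: storage the enqueued task fills in later,
// loosely analogous to returning a ScopedShapedBuffer before the task runs.
struct Result {
  std::shared_ptr<int> slot = std::make_shared<int>(0);
};

class Executable {
 public:
  // Async path: only enqueues work; nothing executes eagerly, so tasks
  // already sitting on the stream run first.
  Result ExecuteAsyncOnStreamImpl(HostStream* stream, int arg) {
    Result result;
    stream->EnqueueTask([slot = result.slot, arg] { *slot = arg * 2; });
    return result;
  }

  // Sync path, shaped like the new ExecuteOnStream: reuse the async
  // implementation, then block until the stream has drained.
  Result ExecuteOnStream(HostStream* stream, int arg) {
    Result result = ExecuteAsyncOnStreamImpl(stream, arg);
    stream->BlockHostUntilDone();
    return result;  // *result.slot is now valid.
  }
};

Because the async path only enqueues work, anything already on the stream runs first in FIFO order, which is the ordering guarantee the old eager implementation violated.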
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_executable.cc | 38
-rw-r--r--  tensorflow/compiler/xla/service/cpu/cpu_executable.h  | 10
2 files changed, 28 insertions(+), 20 deletions(-)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 946f5124b8..c376864c3e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -249,24 +249,11 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     HloExecutionProfile* hlo_execution_profile) {
-  if (GetRootPointsToSet().IsAmbiguous()) {
-    return Unimplemented("Points-to set of root instruction is ambiguous");
-  }
-
-  se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-
-  std::vector<OwningDeviceMemory> owning_buffers;
-  std::vector<se::DeviceMemoryBase> unowning_buffers;
   TF_ASSIGN_OR_RETURN(
-      std::tie(unowning_buffers, owning_buffers),
-      CreateTempArray(memory_allocator, stream->parent()->device_ordinal(),
-                      arguments));
-
-  TF_RETURN_IF_ERROR(ExecuteComputeFunction(
-      &run_options->run_options(), unowning_buffers, hlo_execution_profile));
-
-  return CreateResultShapedBuffer(run_options, &owning_buffers);
+      auto result,
+      ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile));
+  TF_RETURN_IF_ERROR(run_options->stream()->BlockHostUntilDone());
+  return std::move(result);
 }
 
 StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
@@ -277,6 +264,16 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
         "Asynchronous execution on stream with hlo profiling is not yet "
         "supported on CPU.");
   }
+  return ExecuteAsyncOnStreamImpl(run_options, arguments, nullptr);
+}
+
+StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
+    const ServiceExecutableRunOptions* run_options,
+    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+    HloExecutionProfile* hlo_execution_profile) {
+  if (GetRootPointsToSet().IsAmbiguous()) {
+    return Unimplemented("Points-to set of root instruction is ambiguous");
+  }
 
   auto* host_stream = dynamic_cast<se::host::HostStream*>(
       run_options->stream()->implementation());
@@ -310,19 +307,20 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
     ServiceExecutableRunOptions run_options;
     std::vector<se::DeviceMemoryBase> unowning_buffers;
     std::shared_ptr<std::vector<OwningDeviceMemory>> buffers;
+    HloExecutionProfile* hlo_execution_profile;
 
     void operator()() {
       // Failing a CHECK here is not great, but I don't see an obvious way to
       // return a failed Status asynchronously.
       TF_CHECK_OK(executable->ExecuteComputeFunction(
-          &run_options.run_options(), unowning_buffers,
-          /*hlo_execution_profile=*/nullptr));
+          &run_options.run_options(), unowning_buffers, hlo_execution_profile));
     }
   };
   host_stream->EnqueueTask(
       AsyncRunTask{this, *run_options, std::move(unowning_buffers),
                    std::make_shared<std::vector<OwningDeviceMemory>>(
-                       std::move(owning_buffers))});
+                       std::move(owning_buffers)),
+                   hlo_execution_profile});
 
   return std::move(result);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 8af8a5dfec..96e53de57e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -85,6 +85,16 @@ class CpuExecutable : public Executable {
   const BufferAssignment& buffer_assignment() const { return *assignment_; }
 
  private:
+  // This is for sharing the code between ExecuteOnStream and
+  // ExecuteAsyncOnStream.
+  //
+  // Notice that it's tricky to use correctly, as the profile object (when it
+  // exists) must out-live the task.
+  StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStreamImpl(
+      const ServiceExecutableRunOptions* run_options,
+      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+      HloExecutionProfile* hlo_execution_profile);
+
   // Creates an array suitable for passing as the "temps" argument to the JIT
   // compiled function pointer.
   //
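The caveat in the new header comment, that the profile object must out-live the task, is the subtle part of this change. The sketch below mirrors the ownership split inside the commit's AsyncRunTask, again with hypothetical stand-in types: the temp buffers are owned by the task through a shared_ptr, while the profile is only borrowed through a raw pointer.

#include <memory>
#include <vector>

struct Buffer {};                     // hypothetical device-memory handle
struct Profile { long cycles = 0; };  // hypothetical profile object

// Mirrors the shape of the commit's AsyncRunTask functor (types simplified).
struct AsyncRunTask {
  // Owned: moving the temp buffers into a shared_ptr keeps them alive for
  // as long as the task (or any copy of it) exists.
  std::shared_ptr<std::vector<Buffer>> buffers;
  // Borrowed: a raw pointer, so the caller must keep the profile alive
  // until the task has run, the "tricky to use correctly" caveat above.
  Profile* hlo_execution_profile;

  void operator()() {
    // ... run the JIT-compiled function over *buffers ...
    if (hlo_execution_profile != nullptr) {
      hlo_execution_profile->cycles += 1;  // record timings when profiling
    }
  }
};

ExecuteOnStream meets that lifetime requirement on the caller's behalf by calling BlockHostUntilDone() before returning; a caller of the bare async path would have to provide the guarantee itself, which is presumably why ExecuteAsyncOnStream still rejects profiling requests.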