aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Tim Shen <timshen@google.com>2018-08-09 15:46:02 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-08-09 15:49:54 -0700
commit9eb310c3f651d69b9a8eea016f6397049c5a0a31 (patch)
treefce34ab87d8304f024dad4f1a77b9aa0448da28f
parent04af1735f203df9fdc539f7c8d6cd1cd0425d25b (diff)
Make CpuExecutable::ExecuteOnStream actually use the stream.
Currently, it eagerly execute the function, ignoring the previously en-queued tasks. Also removed the "Asynchronous execution on stream with hlo profiling is not yet supported on CPU." check, as profiling is not supported by the async interface anyway. PiperOrigin-RevId: 208124125
-rw-r--r--tensorflow/compiler/xla/service/cpu/cpu_executable.cc38
-rw-r--r--tensorflow/compiler/xla/service/cpu/cpu_executable.h10
2 files changed, 28 insertions, 20 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 946f5124b8..c376864c3e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -249,24 +249,11 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteOnStream(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
HloExecutionProfile* hlo_execution_profile) {
- if (GetRootPointsToSet().IsAmbiguous()) {
- return Unimplemented("Points-to set of root instruction is ambiguous");
- }
-
- se::Stream* stream = run_options->stream();
- DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-
- std::vector<OwningDeviceMemory> owning_buffers;
- std::vector<se::DeviceMemoryBase> unowning_buffers;
TF_ASSIGN_OR_RETURN(
- std::tie(unowning_buffers, owning_buffers),
- CreateTempArray(memory_allocator, stream->parent()->device_ordinal(),
- arguments));
-
- TF_RETURN_IF_ERROR(ExecuteComputeFunction(
- &run_options->run_options(), unowning_buffers, hlo_execution_profile));
-
- return CreateResultShapedBuffer(run_options, &owning_buffers);
+ auto result,
+ ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile));
+ TF_RETURN_IF_ERROR(run_options->stream()->BlockHostUntilDone());
+ return std::move(result);
}
StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
@@ -277,6 +264,16 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
"Asynchronous execution on stream with hlo profiling is not yet "
"supported on CPU.");
}
+ return ExecuteAsyncOnStreamImpl(run_options, arguments, nullptr);
+}
+
+StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
+ const ServiceExecutableRunOptions* run_options,
+ tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+ HloExecutionProfile* hlo_execution_profile) {
+ if (GetRootPointsToSet().IsAmbiguous()) {
+ return Unimplemented("Points-to set of root instruction is ambiguous");
+ }
auto* host_stream = dynamic_cast<se::host::HostStream*>(
run_options->stream()->implementation());
@@ -310,19 +307,20 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStream(
ServiceExecutableRunOptions run_options;
std::vector<se::DeviceMemoryBase> unowning_buffers;
std::shared_ptr<std::vector<OwningDeviceMemory>> buffers;
+ HloExecutionProfile* hlo_execution_profile;
void operator()() {
// Failing a CHECK here is not great, but I don't see an obvious way to
// return a failed Status asynchronously.
TF_CHECK_OK(executable->ExecuteComputeFunction(
- &run_options.run_options(), unowning_buffers,
- /*hlo_execution_profile=*/nullptr));
+ &run_options.run_options(), unowning_buffers, hlo_execution_profile));
}
};
host_stream->EnqueueTask(
AsyncRunTask{this, *run_options, std::move(unowning_buffers),
std::make_shared<std::vector<OwningDeviceMemory>>(
- std::move(owning_buffers))});
+ std::move(owning_buffers)),
+ hlo_execution_profile});
return std::move(result);
}
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 8af8a5dfec..96e53de57e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -85,6 +85,16 @@ class CpuExecutable : public Executable {
const BufferAssignment& buffer_assignment() const { return *assignment_; }
private:
+ // This is for sharing the code between ExecuteOnStream and
+ // ExecuteAsyncOnStream.
+ //
+ // Notice that it's tricky to use correctly, as the profile object (when it
+ // exists) must out-live the task.
+ StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStreamImpl(
+ const ServiceExecutableRunOptions* run_options,
+ tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+ HloExecutionProfile* hlo_execution_profile);
+
// Creates an array suitable for passing as the "temps" argument to the JIT
// compiled function pointer.
//