author    A. Unique TensorFlower <gardener@tensorflow.org>  2017-12-15 23:38:01 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>   2017-12-18 10:10:43 -0800
commit    fc2526a8c1cf0bc2a93c8cc819ff7209eb4628c9 (patch)
tree      eb4a3ea9ba49b7cbeabe1195c98267856b6d62ae /tensorflow/compiler/xla/service/gpu/gpu_executable.cc
parent    f2996ec7a1e0e9f4d07637889475681a7432375c (diff)
Merged commit includes the following changes:
179277894 by gunan:

    Run buildifier on build file.

--
179275101 by meheff:

    Replace DeviceMemoryBase with ShapedBuffer in XLA interfaces.

    Executable, TransferManager, and AllocationTracker now use ShapedBuffer
    to hold the device memory addresses of XLA data. Most of the change is
    straightforward, with the exception of AllocationTracker, which was
    mostly rewritten (and simplified), and some refactoring in the CPU
    executable.

    Also, ShapedBuffer now holds both an on-host and an on-device Shape,
    i.e. the shapes of the data's representation on the host and on the
    device, respectively. This is necessary because, as of cl/178624364,
    the on-host and on-device shapes may no longer be equal.

--
179265385 by A. Unique TensorFlower:

    Return an error rather than CHECK-failing in
    Executable::ExecuteOnStreamWrapper.

--
179264551 by dandelion:

    Internal fixes.

--
PiperOrigin-RevId: 179277894
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/gpu_executable.cc')
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_executable.cc  142
1 file changed, 30 insertions, 112 deletions
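
Before the hunks, a minimal self-contained C++ sketch of the idea behind the ShapedBuffer migration described above. All types here are hypothetical stand-ins for illustration only; the real declarations live in tensorflow/compiler/xla/service/shaped_buffer.h, take a se::Platform* in the constructor, and use a ShapeTree<DeviceMemoryBase> rather than a plain map.

#include <cstddef>
#include <map>
#include <utility>
#include <vector>

// Stand-in for xla::Shape (dimensions, element type, layout).
struct Shape {};

// Stand-in for se::DeviceMemoryBase: an opaque device pointer plus size.
struct DeviceMemoryBase {
  void* opaque = nullptr;
  std::size_t size = 0;
};

// The core of the change: a ShapedBuffer ties the device buffers of one XLA
// value to both its on-host and on-device Shapes. The two shapes may differ
// (per cl/178624364), which is why the new constructor call in this diff
// passes root->shape() twice.
class ShapedBuffer {
 public:
  ShapedBuffer(Shape on_host_shape, Shape on_device_shape, int device_ordinal)
      : on_host_shape_(std::move(on_host_shape)),
        on_device_shape_(std::move(on_device_shape)),
        device_ordinal_(device_ordinal) {
    buffers_[{}] = DeviceMemoryBase{};  // The root slot always exists.
  }

  const Shape& on_host_shape() const { return on_host_shape_; }
  const Shape& on_device_shape() const { return on_device_shape_; }
  int device_ordinal() const { return device_ordinal_; }

  // Buffer for one element of the (possibly tuple-shaped) value. The index
  // is a path into the tuple tree; {} addresses the root, which is what the
  // diff's arguments[param_no]->root_buffer() call reads.
  void set_buffer(const std::vector<int>& index, DeviceMemoryBase memory) {
    buffers_[index] = memory;
  }
  const DeviceMemoryBase& root_buffer() const { return buffers_.at({}); }

 private:
  Shape on_host_shape_;
  Shape on_device_shape_;
  int device_ordinal_;
  std::map<std::vector<int>, DeviceMemoryBase> buffers_;
};

int main() {
  // A scalar (non-tuple) result: one root buffer, shapes equal on each side.
  ShapedBuffer result(Shape{}, Shape{}, /*device_ordinal=*/0);
  result.set_buffer({}, DeviceMemoryBase{});
  return result.root_buffer().size == 0 ? 0 : 1;
}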
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index b802ae9c7a..366d87e9c3 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -203,84 +203,6 @@ Status GpuExecutable::ExecuteThunks(
return Status::OK();
}
-StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream(
- const ServiceExecutableRunOptions* run_options,
- tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
- HloExecutionProfile* hlo_execution_profile) {
- se::Stream* stream = run_options->stream();
- DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-
- BufferAllocations::Builder buffer_allocations_builder;
- for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
- ++i) {
- const BufferAllocation& allocation = assignment_->GetAllocation(i);
- if (allocation.is_entry_computation_parameter()) {
- buffer_allocations_builder.RegisterBuffer(
- i, arguments[allocation.parameter_number()]);
- }
- }
- se::StreamExecutor* executor = stream->parent();
- TF_ASSIGN_OR_RETURN(
- auto buffer_allocations,
- buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(),
- memory_allocator));
-
- bool block_host_until_done =
- !memory_allocator->AllowsAsynchronousDeallocation();
- TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations,
- block_host_until_done,
- hlo_execution_profile));
-
- HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
- TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice output_slice,
- assignment_->GetUniqueTopLevelOutputSlice());
- se::DeviceMemoryBase output_buffer_address =
- buffer_allocations->GetDeviceAddress(output_slice.index());
-
- if (ShapeUtil::IsTuple(root->shape())) {
- std::set<se::DeviceMemoryBase> referred_by_output;
- if (GetRootPointsToSet().IsAmbiguous()) {
- // The points-to set of the root is ambiguous so we need to examine the
- // result data to determine which buffers are contained in the result.
- TF_ASSIGN_OR_RETURN(
- TransferManager * transfer_manager,
- TransferManager::GetForPlatform(executor->platform()));
- TF_ASSIGN_OR_RETURN(referred_by_output,
- transfer_manager->GatherBufferPointersFromTuple(
- executor, output_buffer_address, root->shape()));
- } else {
- // The points-to set of the root is unambiguous so it's known statically
- // which buffers are in the result. Gather these buffers using the root's
- // points-to set.
- TF_RETURN_IF_ERROR(GetRootPointsToSet().ForEachElementWithStatus(
- [&referred_by_output, &buffer_allocations, this](
- const ShapeIndex& /*index*/,
- const PointsToSet::BufferList& buffers) {
- // The points to set is unambiguous so the set should be a
- // singleton. That is, we know exactly which instruction produced
- // the array at this element.
- CHECK_EQ(1, buffers.size());
- HloInstruction* hlo = buffers[0]->instruction();
- TF_ASSIGN_OR_RETURN(
- const BufferAllocation::Slice slice,
- this->assignment_->GetUniqueSlice(hlo, buffers[0]->index()));
- CHECK(!slice.allocation()->is_entry_computation_parameter());
- referred_by_output.insert(
- buffer_allocations->GetDeviceAddress(slice.index()));
- return Status::OK();
- }));
- }
- TF_RETURN_IF_ERROR(
- buffer_allocations->TearDown(referred_by_output, *assignment_));
- } else {
- // If the computation result is not a tuple, we can delete all temporary
- // buffers that are not the output.
- TF_RETURN_IF_ERROR(
- buffer_allocations->TearDown({output_buffer_address}, *assignment_));
- }
- return output_buffer_address;
-}
-
StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -298,7 +220,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
if (allocation.is_entry_computation_parameter()) {
auto param_no = allocation.parameter_number();
buffer_allocations_builder.RegisterBuffer(
- i, arguments[param_no]->buffer(/*index=*/{}));
+ i, arguments[param_no]->root_buffer());
}
}
se::StreamExecutor* executor = run_options->stream()->parent();
@@ -316,50 +238,46 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
auto device_ordinal = executor->device_ordinal();
auto shaped_buffer = MakeUnique<ShapedBuffer>(
- root->shape(), executor->platform(), device_ordinal);
+ root->shape(), root->shape(), executor->platform(), device_ordinal);
// Copy DeviceMemoryBase values which contain the array(s) of the result into
// the respective location in ShapedBuffer.
std::set<se::DeviceMemoryBase> buffers_in_result;
- TF_RETURN_IF_ERROR(
- shaped_buffer->mutable_shape_index_to_buffer_entry()
- ->ForEachMutableElementWithStatus(
- [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
- const ShapeIndex& index, size_t* buffer_entry) {
- const auto& sources = this->GetRootPointsToSet().element(index);
- // The points-to set is unambiguous so the set should be a
- // singleton. That is, we know exactly which instruction
- // produced the array at this element.
- CHECK_EQ(1, sources.size());
- auto src_hlo = sources[0]->instruction();
-
- VLOG(4) << "Looking at: " << sources[0];
-
- // The source instruction should have a non-parameter buffer
- // assigned.
- TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
- this->assignment_->GetUniqueSlice(
- src_hlo, sources[0]->index()));
- CHECK(!slice.allocation()->is_entry_computation_parameter());
-
- perftools::gputools::DeviceMemoryBase src_base =
- buffer_allocations->GetDeviceAddress(slice.index());
- CHECK(!src_base.is_null() || src_base.size() == 0);
- shaped_buffer->mutable_buffers()->push_back(src_base);
- *buffer_entry = shaped_buffer->mutable_buffers()->size() - 1;
-
- buffers_in_result.insert(src_base);
- return Status::OK();
- }));
+ TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
+ [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
+ const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
+ const auto& sources = this->GetRootPointsToSet().element(index);
+ // The points-to set is unambiguous so the set should be a
+ // singleton. That is, we know exactly which instruction
+ // produced the array at this element.
+ CHECK_EQ(1, sources.size());
+ auto src_hlo = sources[0]->instruction();
+
+ VLOG(4) << "Looking at: " << sources[0];
+
+ // The source instruction should have a non-parameter buffer
+ // assigned.
+ TF_ASSIGN_OR_RETURN(
+ const BufferAllocation::Slice slice,
+ this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
+ CHECK(!slice.allocation()->is_entry_computation_parameter());
+
+ perftools::gputools::DeviceMemoryBase src_base =
+ buffer_allocations->GetDeviceAddress(slice.index());
+ CHECK(!src_base.is_null() || src_base.size() == 0);
+ *device_memory = src_base;
+ buffers_in_result.insert(src_base);
+ return Status::OK();
+ }));
TF_RETURN_IF_ERROR(
buffer_allocations->TearDown(buffers_in_result, *assignment_));
return std::move(shaped_buffer);
}
-StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteAsyncOnStream(
+StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
const ServiceExecutableRunOptions* run_options,
- tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+ tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
// TODO(b/30671675): Implement asynchronous execution mode.
return Unimplemented(
"Asynchronous execution on stream is not yet supported on GPU.");