diff options
author | 2017-12-15 23:38:01 -0800 | |
---|---|---|
committer | 2017-12-18 10:10:43 -0800 | |
commit | fc2526a8c1cf0bc2a93c8cc819ff7209eb4628c9 (patch) | |
tree | eb4a3ea9ba49b7cbeabe1195c98267856b6d62ae /tensorflow/compiler/xla/service/gpu/gpu_executable.cc | |
parent | f2996ec7a1e0e9f4d07637889475681a7432375c (diff) |
Merged commit includes the following changes:
179277894 by gunan:
Run buildifier on build file.
--
179275101 by meheff:
Replace DeviceMemoryBase with ShapedBuffer in XLA interfaces.
Executable, TransferManager, and AllocationTracker now use ShapedBuffer to hold device memory addresses holding XLA data. Most of the change is straightforward, with the exception of AllocationTracker, which was mostly rewritten (and simplified), and some refactoring in the CPU executable.
Also, have ShapedBuffer hold on-host and on-device Shapes, which are the shapes of the representation of the data on the host and device, respectively. This is necessary because with cl/178624364 the on-host and on-device shapes may no longer be equal.
--
179265385 by A. Unique TensorFlower:
Return error rather than CHECK fail in Executable::ExecuteOnStreamWrapper
--
179264551 by dandelion:
Internal fixes.
--
PiperOrigin-RevId: 179277894
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/gpu_executable.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/gpu_executable.cc | 142 |
1 files changed, 30 insertions, 112 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index b802ae9c7a..366d87e9c3 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -203,84 +203,6 @@ Status GpuExecutable::ExecuteThunks( return Status::OK(); } -StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteOnStream( - const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments, - HloExecutionProfile* hlo_execution_profile) { - se::Stream* stream = run_options->stream(); - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - - BufferAllocations::Builder buffer_allocations_builder; - for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); - ++i) { - const BufferAllocation& allocation = assignment_->GetAllocation(i); - if (allocation.is_entry_computation_parameter()) { - buffer_allocations_builder.RegisterBuffer( - i, arguments[allocation.parameter_number()]); - } - } - se::StreamExecutor* executor = stream->parent(); - TF_ASSIGN_OR_RETURN( - auto buffer_allocations, - buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(), - memory_allocator)); - - bool block_host_until_done = - !memory_allocator->AllowsAsynchronousDeallocation(); - TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations, - block_host_until_done, - hlo_execution_profile)); - - HloInstruction* root = hlo_module_->entry_computation()->root_instruction(); - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice output_slice, - assignment_->GetUniqueTopLevelOutputSlice()); - se::DeviceMemoryBase output_buffer_address = - buffer_allocations->GetDeviceAddress(output_slice.index()); - - if (ShapeUtil::IsTuple(root->shape())) { - std::set<se::DeviceMemoryBase> referred_by_output; - if (GetRootPointsToSet().IsAmbiguous()) { - // The points-to set of the root is ambiguous so we need to examine the 
- // result data to determine which buffers are contained in the result. - TF_ASSIGN_OR_RETURN( - TransferManager * transfer_manager, - TransferManager::GetForPlatform(executor->platform())); - TF_ASSIGN_OR_RETURN(referred_by_output, - transfer_manager->GatherBufferPointersFromTuple( - executor, output_buffer_address, root->shape())); - } else { - // The points-to set of the root is unambiguous so it's known statically - // which buffers are in the result. Gather these buffers using the root's - // points-to set. - TF_RETURN_IF_ERROR(GetRootPointsToSet().ForEachElementWithStatus( - [&referred_by_output, &buffer_allocations, this]( - const ShapeIndex& /*index*/, - const PointsToSet::BufferList& buffers) { - // The points to set is unambiguous so the set should be a - // singleton. That is, we know exactly which instruction produced - // the array at this element. - CHECK_EQ(1, buffers.size()); - HloInstruction* hlo = buffers[0]->instruction(); - TF_ASSIGN_OR_RETURN( - const BufferAllocation::Slice slice, - this->assignment_->GetUniqueSlice(hlo, buffers[0]->index())); - CHECK(!slice.allocation()->is_entry_computation_parameter()); - referred_by_output.insert( - buffer_allocations->GetDeviceAddress(slice.index())); - return Status::OK(); - })); - } - TF_RETURN_IF_ERROR( - buffer_allocations->TearDown(referred_by_output, *assignment_)); - } else { - // If the computation result is not a tuple, we can delete all temporary - // buffers that are not the output. 
- TF_RETURN_IF_ERROR( - buffer_allocations->TearDown({output_buffer_address}, *assignment_)); - } - return output_buffer_address; -} - StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments, @@ -298,7 +220,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream( if (allocation.is_entry_computation_parameter()) { auto param_no = allocation.parameter_number(); buffer_allocations_builder.RegisterBuffer( - i, arguments[param_no]->buffer(/*index=*/{})); + i, arguments[param_no]->root_buffer()); } } se::StreamExecutor* executor = run_options->stream()->parent(); @@ -316,50 +238,46 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream( HloInstruction* root = hlo_module_->entry_computation()->root_instruction(); auto device_ordinal = executor->device_ordinal(); auto shaped_buffer = MakeUnique<ShapedBuffer>( - root->shape(), executor->platform(), device_ordinal); + root->shape(), root->shape(), executor->platform(), device_ordinal); // Copy DeviceMemoryBase values which contain the array(s) of the result into // the respective location in ShapedBuffer. std::set<se::DeviceMemoryBase> buffers_in_result; - TF_RETURN_IF_ERROR( - shaped_buffer->mutable_shape_index_to_buffer_entry() - ->ForEachMutableElementWithStatus( - [&buffer_allocations, &buffers_in_result, &shaped_buffer, this]( - const ShapeIndex& index, size_t* buffer_entry) { - const auto& sources = this->GetRootPointsToSet().element(index); - // The points-to set is unambiguous so the set should be a - // singleton. That is, we know exactly which instruction - // produced the array at this element. - CHECK_EQ(1, sources.size()); - auto src_hlo = sources[0]->instruction(); - - VLOG(4) << "Looking at: " << sources[0]; - - // The source instruction should have a non-parameter buffer - // assigned. 
- TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice, - this->assignment_->GetUniqueSlice( - src_hlo, sources[0]->index())); - CHECK(!slice.allocation()->is_entry_computation_parameter()); - - perftools::gputools::DeviceMemoryBase src_base = - buffer_allocations->GetDeviceAddress(slice.index()); - CHECK(!src_base.is_null() || src_base.size() == 0); - shaped_buffer->mutable_buffers()->push_back(src_base); - *buffer_entry = shaped_buffer->mutable_buffers()->size() - 1; - - buffers_in_result.insert(src_base); - return Status::OK(); - })); + TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus( + [&buffer_allocations, &buffers_in_result, &shaped_buffer, this]( + const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { + const auto& sources = this->GetRootPointsToSet().element(index); + // The points-to set is unambiguous so the set should be a + // singleton. That is, we know exactly which instruction + // produced the array at this element. + CHECK_EQ(1, sources.size()); + auto src_hlo = sources[0]->instruction(); + + VLOG(4) << "Looking at: " << sources[0]; + + // The source instruction should have a non-parameter buffer + // assigned. 
+ TF_ASSIGN_OR_RETURN( + const BufferAllocation::Slice slice, + this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index())); + CHECK(!slice.allocation()->is_entry_computation_parameter()); + + perftools::gputools::DeviceMemoryBase src_base = + buffer_allocations->GetDeviceAddress(slice.index()); + CHECK(!src_base.is_null() || src_base.size() == 0); + *device_memory = src_base; + buffers_in_result.insert(src_base); + return Status::OK(); + })); TF_RETURN_IF_ERROR( buffer_allocations->TearDown(buffers_in_result, *assignment_)); return std::move(shaped_buffer); } -StatusOr<se::DeviceMemoryBase> GpuExecutable::ExecuteAsyncOnStream( +StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) { + tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) { // TODO(b/30671675): Implement asynchronous execution mode. return Unimplemented( "Asynchronous execution on stream is not yet supported on GPU."); |