Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc')
-rw-r--r--  tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc | 111
1 file changed, 111 insertions, 0 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc
new file mode 100644
index 0000000000..7986e63f43
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc
@@ -0,0 +1,111 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+OutfeedThunk::OutfeedThunk(ShapeTree<BufferAllocation::Slice> outfeed_slices,
+                           const HloInstruction* hlo_instruction)
+    : Thunk(Kind::kOutfeed, hlo_instruction),
+      outfeed_slices_(std::move(outfeed_slices)) {}
+
+Status OutfeedThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream,
+    HloExecutionProfiler* profiler) {
+  VLOG(2) << "Outfeeding from GPU: " << hlo_instruction()->ToString();
+
+  auto op_profiler = profiler->MakeScopedInstructionProfiler(hlo_instruction());
+  OutfeedManager* outfeed_manager = GetOrCreateOutfeedManager();
+  ShapeTree<std::unique_ptr<OutfeedBuffer>>* outfeed_buffers =
+      outfeed_manager->BlockingGetNextDestination();
+
+  // Nothing to be done for empty tuples.
+  if (ShapeUtil::IsEmptyTuple(hlo_instruction()->operand(0)->shape())) {
+    return Status::OK();
+  }
+  CHECK(ShapeUtil::Compatible(hlo_instruction()->operand(0)->shape(),
+                              outfeed_buffers->shape()));
+
+  TF_RETURN_IF_ERROR(outfeed_buffers->ForEachMutableElementWithStatus(
+      [&](const ShapeIndex& index, std::unique_ptr<OutfeedBuffer>* buffer) {
+        if (!*buffer) {  // Tuple pointers.
+          return Status::OK();
+        }
+        // Allocate storage for the literal data.
+        const Shape& shape =
+            ShapeUtil::GetSubshape(outfeed_buffers->shape(), index);
+        (*buffer)->set_destination(Literal::CreateFromShape(shape));
+
+        BufferAllocation::Slice slice = outfeed_slices_.element(index);
+        se::DeviceMemoryBase data_address;
+        if (slice.allocation()) {
+          // If we have a static allocation, read it from there. This avoids
+          // synchronizing the host and device just to read a pointer.
+          data_address = buffer_allocations.GetDeviceAddress(slice);
+        } else {
+          // Otherwise we have to read the tuple pointer first.
+          CHECK(!index.empty());
+          // Copy the parent buffer to the host.
+          BufferAllocation::Slice tuple_slice =
+              outfeed_slices_.element(ShapeIndexView(index).ConsumeFront());
+          if (!tuple_slice.allocation()) {
+            return Unimplemented(
+                "Nested dynamic tuples are not supported on GPU");
+          }
+          se::DeviceMemoryBase tuple_address =
+              buffer_allocations.GetDeviceAddress(tuple_slice);
+          CHECK(tuple_slice.size() % sizeof(void*) == 0)
+              << "Tuple size must be a multiple of pointer size";
+          std::vector<void*> tuple_element_buffer_addresses(tuple_slice.size() /
+                                                            sizeof(void*));
+          stream->ThenMemcpy(tuple_element_buffer_addresses.data(),
+                             tuple_address, tuple_slice.size());
+          TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
+          // The data address is specified by the element of the tuple pointer
+          // buffer.
+          data_address =
+              se::DeviceMemoryBase(tuple_element_buffer_addresses[index.back()],
                                   (*buffer)->length());
+        }
+
+        // TODO(b/111309141): Run this on a separate stream so it doesn't block
+        // the GPU from doing work during the transfer. This could be handled by
+        // making StreamAssignment do something intelligent with outfeed thunks.
+        stream
+            ->ThenMemcpy((*buffer)->destination()->untyped_data(), data_address,
+                         (*buffer)->length())
+            .ThenDoHostCallback([buffer]() { (*buffer)->Done(); });
+        return Status::OK();
+      }));
+
+  Status block_status = stream->BlockHostUntilDone();
+  if (!block_status.ok()) {
+    return InternalError("Failed to complete data transfer on stream %p: %s",
+                         stream, block_status.error_message().c_str());
+  }
+
+  VLOG(2) << "Outfeeding from GPU complete";
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla