Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc')
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc | 150
1 file changed, 92 insertions, 58 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index 7bb8df6581..79b3f1efec 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -20,8 +20,10 @@ limitations under the License.
 #include <vector>
 
 #include "llvm/IR/DataLayout.h"
+#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"
+#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h"
+#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -34,15 +36,14 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
+namespace gpu {
 
 // TODO(b/30467474) Once GPU infeed implementation settles, consider
 // folding back the cpu and gpu infeed implementations into a generic
 // one if possible.
-GpuTransferManager::GpuTransferManager()
-    : GenericTransferManager(
-          se::cuda::kCudaPlatformId,
-          /*pointer_size=*/llvm::DataLayout(gpu::GpuCompiler::kDataLayout)
-              .getPointerSize(0 /* default address space */)) {}
+GpuTransferManager::GpuTransferManager(se::Platform::Id id,
+                                       unsigned pointer_size)
+    : GenericTransferManager(id, pointer_size) {}
 
 Status GpuTransferManager::TransferLiteralToInfeed(
     se::StreamExecutor* executor, const LiteralSlice& literal) {
@@ -50,53 +51,28 @@ Status GpuTransferManager::TransferLiteralToInfeed(
   VLOG(2) << "Transferring literal to infeed with shape: "
           << ShapeUtil::HumanString(shape);
 
-  if (!ShapeUtil::IsTuple(shape)) {
-    int64 size = GetByteSizeRequirement(shape);
-    return TransferBufferToInfeed(executor, size, literal.untyped_data());
-  }
-
-  if (ShapeUtil::IsNestedTuple(shape)) {
-    return Unimplemented(
-        "Infeed with a nested tuple shape is not supported: %s",
-        ShapeUtil::HumanString(literal.shape()).c_str());
-  }
-
   // For a tuple, we transfer each of its elements to the device and
   // enqueue the resulting destination device addresses with the
   // infeed manager.
-  std::vector<gpu::InfeedBuffer*> buffers;
-  buffers.reserve(ShapeUtil::TupleElementCount(shape));
-  auto cleanup = tensorflow::gtl::MakeCleanup([buffers]() {
-    for (gpu::InfeedBuffer* b : buffers) {
-      b->Done();
-    }
-  });
-
-  for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
-    const Shape& tuple_element_shape =
-        ShapeUtil::GetTupleElementShape(shape, i);
-    int64 tuple_element_size = GetByteSizeRequirement(tuple_element_shape);
-    TF_ASSIGN_OR_RETURN(
-        gpu::InfeedBuffer * buffer,
-        TransferBufferToInfeedInternal(executor, tuple_element_size,
-                                       literal.untyped_data({i})));
-    buffers.push_back(buffer);
-  }
-
-  cleanup.release();
-  return EnqueueBuffersToInfeed(executor, buffers);
-}
-
-Status GpuTransferManager::TransferBufferToInfeed(se::StreamExecutor* executor,
-                                                  int64 size,
-                                                  const void* source) {
-  TF_ASSIGN_OR_RETURN(gpu::InfeedBuffer * buffer,
-                      TransferBufferToInfeedInternal(executor, size, source));
-  return EnqueueBuffersToInfeed(executor, {buffer});
+  ShapeTree<InfeedBuffer> buffer_tree(shape);
+
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      shape, [&](const Shape& literal_subshape, const ShapeIndex& index) {
+        if (ShapeUtil::IsArray(literal_subshape)) {
+          int64 tuple_element_size = GetByteSizeRequirement(literal_subshape);
+          TF_ASSIGN_OR_RETURN(
+              *buffer_tree.mutable_element(index),
+              TransferBufferToInfeedInternal(executor, tuple_element_size,
                                              literal.untyped_data(index)));
+        }
+        return Status::OK();
+      }));
+
+  return EnqueueBuffersToInfeed(executor, std::move(buffer_tree));
 }
 
 Status GpuTransferManager::EnqueueBuffersToInfeed(
-    se::StreamExecutor* executor, std::vector<gpu::InfeedBuffer*> buffers) {
+    se::StreamExecutor* executor, ShapeTree<InfeedBuffer> buffers) {
   gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager();
   se::Stream* stream = infeed_manager->GetStream(executor);
 
@@ -106,21 +82,18 @@ Status GpuTransferManager::EnqueueBuffersToInfeed(
   // possible.
   Status block_status = stream->BlockHostUntilDone();
   if (!block_status.ok()) {
-    for (gpu::InfeedBuffer* b : buffers) {
-      b->Done();
-    }
     return InternalError("Failed to complete data transfer on stream %p: %s",
                          stream, block_status.error_message().c_str());
   }
-  infeed_manager->EnqueueBuffers(buffers);
+  infeed_manager->EnqueueDestination(std::move(buffers));
 
   VLOG(2) << "Infeed data transferred";
 
   return Status::OK();
 }
 
-StatusOr<gpu::InfeedBuffer*> GpuTransferManager::TransferBufferToInfeedInternal(
+StatusOr<InfeedBuffer> GpuTransferManager::TransferBufferToInfeedInternal(
     se::StreamExecutor* executor, int64 size, const void* source) {
   if (size > std::numeric_limits<int32>::max()) {
     return InvalidArgument("Infeed shape is too large: needs %lld bytes", size);
   }
@@ -136,23 +109,84 @@ StatusOr<gpu::InfeedBuffer*> GpuTransferManager::TransferBufferToInfeedInternal(
     return InternalError("Failed to obtain a stream");
   }
 
-  gpu::InfeedBuffer* buffer = new gpu::InfeedBuffer(executor, size);
-  stream->ThenMemcpy(buffer->device_memory(), source, size);
+  InfeedBuffer buffer(executor, size);
+  stream->ThenMemcpy(buffer.device_memory(), source, size);
 
   VLOG(2) << "Queued infeed data on stream " << stream;
 
-  return buffer;
+  return std::move(buffer);
+}
+
+static std::unique_ptr<Literal> ShapeTreeToLiteral(
+    ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>>* shape_tree) {
+  // This is a struct instead of a lambda for std::function-free recursion.
+  struct Helper {
+    static std::unique_ptr<Literal> helper(
+        ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>>* shape_tree,
+        ShapeIndex* index) {
+      const Shape& shape = ShapeUtil::GetSubshape(shape_tree->shape(), *index);
+      if (ShapeUtil::IsArray(shape)) {
+        return (*shape_tree->mutable_element(*index))->WaitUntilAvailable();
+      }
+
+      CHECK(ShapeUtil::IsTuple(shape))
+          << ShapeUtil::HumanStringWithLayout(shape);
+      const int64 tuple_element_count = ShapeUtil::TupleElementCount(shape);
+      index->push_back(0);
+      std::vector<std::unique_ptr<Literal>> tuple_operands;
+      for (int64 i = 0; i < tuple_element_count; ++i) {
+        index->back() = i;
+        tuple_operands.push_back(helper(shape_tree, index));
+      }
+      index->pop_back();
+      return LiteralUtil::MakeTupleOwned(std::move(tuple_operands));
+    }
+  };
+  ShapeIndex index;
+  return Helper::helper(shape_tree, &index);
+}
+
+Status GpuTransferManager::TransferLiteralFromOutfeed(
+    se::StreamExecutor* /*executor*/, const Shape& literal_shape,
+    Literal* literal) {
+  ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>> outfeed_buffers(
+      &literal_shape);
+
+  // First create a tree of literal buffers that the device can write to.
+  outfeed_buffers.ForEachMutableElement(
+      [&](const ShapeIndex& index,
+          std::unique_ptr<gpu::OutfeedBuffer>* buffer) {
+        const Shape& shape = ShapeUtil::GetSubshape(literal_shape, index);
+        // Do not transfer tuple index buffers.
+        if (ShapeUtil::IsTuple(shape)) {
+          return;
+        }
+        *buffer = MakeUnique<gpu::OutfeedBuffer>(GetByteSizeRequirement(shape));
+      });
+
+  // Give the tree of buffers to the outfeed mananger. The device will fill it
+  // while we're waiting for it below.
+  gpu::OutfeedManager* outfeed_manager = gpu::GetOrCreateOutfeedManager();
+  outfeed_manager->EnqueueDestination(&outfeed_buffers);
+
+  // Now turn the tree of buffers back into a literal.
+  *literal = std::move(*ShapeTreeToLiteral(&outfeed_buffers));
+  return Status::OK();
 }
 
+}  // namespace gpu
 }  // namespace xla
 
-static std::unique_ptr<xla::TransferManager> CreateGpuTransferManager() {
-  return xla::MakeUnique<xla::GpuTransferManager>();
+static std::unique_ptr<xla::TransferManager> CreateNVPTXTransferManager() {
+  return xla::MakeUnique<xla::gpu::GpuTransferManager>(
+      /*id=*/stream_executor::cuda::kCudaPlatformId,
+      /*pointer_size=*/llvm::DataLayout(xla::gpu::NVPTXCompiler::kDataLayout)
+          .getPointerSize(0 /* default address space */));
 }
 
 static bool InitModule() {
   xla::TransferManager::RegisterTransferManager(
-      stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager);
+      stream_executor::cuda::kCudaPlatformId, &CreateNVPTXTransferManager);
  return true;
}
static bool module_initialized = InitModule();
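Editor's note, not part of the change above: the core pattern this commit introduces is a per-leaf walk over a (possibly nested) tuple shape, with one device buffer stored per array leaf and keyed by its ShapeIndex. The following is a minimal, self-contained C++ sketch of that traversal idea only; ToyShape and VisitLeaves are invented illustrative names, not XLA types or APIs, and the byte sizes are made up.

// Toy sketch (not the XLA API): mimics how the new TransferLiteralToInfeed
// visits only the array leaves of a nested tuple shape, identified by the
// path of tuple positions (the "shape index"), so that each leaf can be
// paired with exactly one device buffer.
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

struct ToyShape {
  bool is_tuple;                     // true for tuples, false for array leaves
  int64_t byte_size;                 // meaningful only for array leaves
  std::vector<ToyShape> elements;    // meaningful only for tuples
};

// Calls `visit` with the shape index of every array leaf, analogous to
// visiting each buffer slot of a ShapeTree in the diff above.
void VisitLeaves(const ToyShape& shape, std::vector<int64_t>* index,
                 const std::function<void(const std::vector<int64_t>&,
                                          const ToyShape&)>& visit) {
  if (!shape.is_tuple) {
    visit(*index, shape);
    return;
  }
  for (int64_t i = 0; i < static_cast<int64_t>(shape.elements.size()); ++i) {
    index->push_back(i);
    VisitLeaves(shape.elements[i], index, visit);
    index->pop_back();
  }
}

int main() {
  // A tuple shape ((16-byte leaf, 8-byte leaf), 32-byte leaf).
  ToyShape inner{true, 0, {ToyShape{false, 16, {}}, ToyShape{false, 8, {}}}};
  ToyShape shape{true, 0, {inner, ToyShape{false, 32, {}}}};
  std::vector<int64_t> index;
  VisitLeaves(shape, &index,
              [](const std::vector<int64_t>& idx, const ToyShape& leaf) {
                std::cout << "leaf at index {";
                for (size_t i = 0; i < idx.size(); ++i) {
                  std::cout << (i ? "," : "") << idx[i];
                }
                std::cout << "} needs " << leaf.byte_size << " bytes\n";
              });
  return 0;
}

Run as-is, this prints one line per array leaf (for example "leaf at index {0,0} needs 16 bytes"), which mirrors how the new code allocates and fills one InfeedBuffer per array subshape before handing the whole tree to the infeed manager, and how the outfeed path reassembles a literal from the same per-leaf structure.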