path: root/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc')
-rw-r--r--  tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc | 150
1 file changed, 92 insertions(+), 58 deletions(-)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index 7bb8df6581..79b3f1efec 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -20,8 +20,10 @@ limitations under the License.
#include <vector>
#include "llvm/IR/DataLayout.h"
+#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"
+#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h"
+#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
@@ -34,15 +36,14 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace xla {
+namespace gpu {
// TODO(b/30467474) Once GPU infeed implementation settles, consider
// folding back the cpu and gpu infeed implementations into a generic
// one if possible.
-GpuTransferManager::GpuTransferManager()
- : GenericTransferManager(
- se::cuda::kCudaPlatformId,
- /*pointer_size=*/llvm::DataLayout(gpu::GpuCompiler::kDataLayout)
- .getPointerSize(0 /* default address space */)) {}
+GpuTransferManager::GpuTransferManager(se::Platform::Id id,
+ unsigned pointer_size)
+ : GenericTransferManager(id, pointer_size) {}
Status GpuTransferManager::TransferLiteralToInfeed(
se::StreamExecutor* executor, const LiteralSlice& literal) {
@@ -50,53 +51,28 @@ Status GpuTransferManager::TransferLiteralToInfeed(
VLOG(2) << "Transferring literal to infeed with shape: "
<< ShapeUtil::HumanString(shape);
- if (!ShapeUtil::IsTuple(shape)) {
- int64 size = GetByteSizeRequirement(shape);
- return TransferBufferToInfeed(executor, size, literal.untyped_data());
- }
-
- if (ShapeUtil::IsNestedTuple(shape)) {
- return Unimplemented(
- "Infeed with a nested tuple shape is not supported: %s",
- ShapeUtil::HumanString(literal.shape()).c_str());
- }
-
// For a tuple, we transfer each of its elements to the device and
// enqueue the resulting destination device addresses with the
// infeed manager.
- std::vector<gpu::InfeedBuffer*> buffers;
- buffers.reserve(ShapeUtil::TupleElementCount(shape));
- auto cleanup = tensorflow::gtl::MakeCleanup([buffers]() {
- for (gpu::InfeedBuffer* b : buffers) {
- b->Done();
- }
- });
-
- for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
- const Shape& tuple_element_shape =
- ShapeUtil::GetTupleElementShape(shape, i);
- int64 tuple_element_size = GetByteSizeRequirement(tuple_element_shape);
- TF_ASSIGN_OR_RETURN(
- gpu::InfeedBuffer * buffer,
- TransferBufferToInfeedInternal(executor, tuple_element_size,
- literal.untyped_data({i})));
- buffers.push_back(buffer);
- }
-
- cleanup.release();
- return EnqueueBuffersToInfeed(executor, buffers);
-}
-
-Status GpuTransferManager::TransferBufferToInfeed(se::StreamExecutor* executor,
- int64 size,
- const void* source) {
- TF_ASSIGN_OR_RETURN(gpu::InfeedBuffer * buffer,
- TransferBufferToInfeedInternal(executor, size, source));
- return EnqueueBuffersToInfeed(executor, {buffer});
+ ShapeTree<InfeedBuffer> buffer_tree(shape);
+
+ TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+ shape, [&](const Shape& literal_subshape, const ShapeIndex& index) {
+ if (ShapeUtil::IsArray(literal_subshape)) {
+ int64 tuple_element_size = GetByteSizeRequirement(literal_subshape);
+ TF_ASSIGN_OR_RETURN(
+ *buffer_tree.mutable_element(index),
+ TransferBufferToInfeedInternal(executor, tuple_element_size,
+ literal.untyped_data(index)));
+ }
+ return Status::OK();
+ }));
+
+ return EnqueueBuffersToInfeed(executor, std::move(buffer_tree));
}
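
The rewritten infeed path above visits every array subshape of the literal and stores one transferred buffer per leaf in a ShapeTree keyed by ShapeIndex, which is what lets nested tuples work where the old code returned Unimplemented. Below is a minimal, self-contained sketch of that traversal; Shape, ShapeIndex, ForEachSubshape, and the buffer map are simplified stand-ins, not the real XLA types.

// Simplified model of the per-leaf traversal in TransferLiteralToInfeed.
#include <functional>
#include <iostream>
#include <map>
#include <vector>

struct Shape {
  bool is_array;             // leaf (array) vs. tuple
  int byte_size;             // only meaningful for arrays
  std::vector<Shape> tuple;  // children when is_array == false
};
using ShapeIndex = std::vector<int>;

// Visit every subshape depth-first, carrying its index path, in the
// spirit of ShapeUtil::ForEachSubshape.
void ForEachSubshape(
    const Shape& shape, ShapeIndex* index,
    const std::function<void(const Shape&, const ShapeIndex&)>& fn) {
  fn(shape, *index);
  for (int i = 0; i < static_cast<int>(shape.tuple.size()); ++i) {
    index->push_back(i);
    ForEachSubshape(shape.tuple[i], index, fn);
    index->pop_back();
  }
}

int main() {
  // (f32[4], (f32[2], f32[8])) -- a nested tuple, which the new code
  // handles where the old code rejected it.
  Shape shape{false, 0, {{true, 16, {}},
                         {false, 0, {{true, 8, {}}, {true, 32, {}}}}}};
  std::map<ShapeIndex, int> buffer_tree;  // stands in for ShapeTree<InfeedBuffer>
  ShapeIndex index;
  ForEachSubshape(shape, &index, [&](const Shape& s, const ShapeIndex& idx) {
    if (s.is_array) buffer_tree[idx] = s.byte_size;  // "transfer" this leaf
  });
  for (const auto& kv : buffer_tree) {
    std::cout << "leaf {";
    for (int i : kv.first) std::cout << i << ",";
    std::cout << "} -> " << kv.second << " bytes\n";
  }
}
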
Status GpuTransferManager::EnqueueBuffersToInfeed(
- se::StreamExecutor* executor, std::vector<gpu::InfeedBuffer*> buffers) {
+ se::StreamExecutor* executor, ShapeTree<InfeedBuffer> buffers) {
gpu::InfeedManager* infeed_manager = gpu::GetOrCreateInfeedManager();
se::Stream* stream = infeed_manager->GetStream(executor);
@@ -106,21 +82,18 @@ Status GpuTransferManager::EnqueueBuffersToInfeed(
// possible.
Status block_status = stream->BlockHostUntilDone();
if (!block_status.ok()) {
- for (gpu::InfeedBuffer* b : buffers) {
- b->Done();
- }
return InternalError("Failed to complete data transfer on stream %p: %s",
stream, block_status.error_message().c_str());
}
- infeed_manager->EnqueueBuffers(buffers);
+ infeed_manager->EnqueueDestination(std::move(buffers));
VLOG(2) << "Infeed data transferred";
return Status::OK();
}
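
Note that the error path above no longer frees buffers by hand: InfeedBuffer has become a move-only value owned by the ShapeTree, so its destructor releases the device memory on every exit path. A minimal sketch of that ownership model follows, assuming a hypothetical InfeedBuffer rather than the real class from infeed_manager.h.

// Why the manual Done() cleanup could be deleted: ownership follows
// normal C++ scoping once the buffer is a move-only RAII value.
#include <cstdio>
#include <utility>

class InfeedBuffer {
 public:
  explicit InfeedBuffer(int size) : size_(size) {
    std::printf("alloc %d\n", size_);  // stands in for device allocation
  }
  ~InfeedBuffer() {
    if (size_ >= 0) std::printf("free %d\n", size_);  // release device memory
  }
  // Move-only: transfers ownership and disarms the source.
  InfeedBuffer(InfeedBuffer&& other) : size_(other.size_) { other.size_ = -1; }
  InfeedBuffer& operator=(InfeedBuffer&& other) {
    std::swap(size_, other.size_);
    return *this;
  }
  InfeedBuffer(const InfeedBuffer&) = delete;
  InfeedBuffer& operator=(const InfeedBuffer&) = delete;

 private:
  int size_;  // -1 marks a moved-from buffer
};

int main() {
  InfeedBuffer a(64);
  InfeedBuffer b = std::move(a);  // no copy; `a` no longer owns the memory
  // If an error occurred here, `b`'s destructor would still run -- no
  // manual Done() calls needed on the failure path.
}
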
-StatusOr<gpu::InfeedBuffer*> GpuTransferManager::TransferBufferToInfeedInternal(
+StatusOr<InfeedBuffer> GpuTransferManager::TransferBufferToInfeedInternal(
se::StreamExecutor* executor, int64 size, const void* source) {
if (size > std::numeric_limits<int32>::max()) {
return InvalidArgument("Infeed shape is too large: needs %lld bytes", size);
@@ -136,23 +109,84 @@ StatusOr<gpu::InfeedBuffer*> GpuTransferManager::TransferBufferToInfeedInternal(
return InternalError("Failed to obtain a stream");
}
- gpu::InfeedBuffer* buffer = new gpu::InfeedBuffer(executor, size);
- stream->ThenMemcpy(buffer->device_memory(), source, size);
+ InfeedBuffer buffer(executor, size);
+ stream->ThenMemcpy(buffer.device_memory(), source, size);
VLOG(2) << "Queued infeed data on stream " << stream;
- return buffer;
+ return std::move(buffer);
+}
+
+static std::unique_ptr<Literal> ShapeTreeToLiteral(
+ ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>>* shape_tree) {
+ // This is a struct instead of a lambda for std::function-free recursion.
+ struct Helper {
+ static std::unique_ptr<Literal> helper(
+ ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>>* shape_tree,
+ ShapeIndex* index) {
+ const Shape& shape = ShapeUtil::GetSubshape(shape_tree->shape(), *index);
+ if (ShapeUtil::IsArray(shape)) {
+ return (*shape_tree->mutable_element(*index))->WaitUntilAvailable();
+ }
+
+ CHECK(ShapeUtil::IsTuple(shape))
+ << ShapeUtil::HumanStringWithLayout(shape);
+ const int64 tuple_element_count = ShapeUtil::TupleElementCount(shape);
+ index->push_back(0);
+ std::vector<std::unique_ptr<Literal>> tuple_operands;
+ for (int64 i = 0; i < tuple_element_count; ++i) {
+ index->back() = i;
+ tuple_operands.push_back(helper(shape_tree, index));
+ }
+ index->pop_back();
+ return LiteralUtil::MakeTupleOwned(std::move(tuple_operands));
+ }
+ };
+ ShapeIndex index;
+ return Helper::helper(shape_tree, &index);
+}
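
As the comment in ShapeTreeToLiteral notes, a lambda cannot refer to itself by name, so a recursive local helper is usually written either with std::function (which adds type-erasure overhead) or as a local struct whose static method can recurse directly. A self-contained illustration of the same pattern:

// Recursive local helper without std::function: sum a tree of nodes.
#include <iostream>
#include <vector>

struct Node {
  int value;
  std::vector<Node> children;
};

int SumTree(const Node& root) {
  struct Helper {
    static int Sum(const Node& n) {
      int total = n.value;
      for (const Node& c : n.children) total += Sum(c);  // direct recursion
      return total;
    }
  };
  return Helper::Sum(root);
}

int main() {
  Node tree{1, {{2, {}}, {3, {{4, {}}}}}};
  std::cout << SumTree(tree) << "\n";  // prints 10
}
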
+
+Status GpuTransferManager::TransferLiteralFromOutfeed(
+ se::StreamExecutor* /*executor*/, const Shape& literal_shape,
+ Literal* literal) {
+ ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>> outfeed_buffers(
+ &literal_shape);
+
+ // First create a tree of literal buffers that the device can write to.
+ outfeed_buffers.ForEachMutableElement(
+ [&](const ShapeIndex& index,
+ std::unique_ptr<gpu::OutfeedBuffer>* buffer) {
+ const Shape& shape = ShapeUtil::GetSubshape(literal_shape, index);
+ // Do not transfer tuple index buffers.
+ if (ShapeUtil::IsTuple(shape)) {
+ return;
+ }
+ *buffer = MakeUnique<gpu::OutfeedBuffer>(GetByteSizeRequirement(shape));
+ });
+
+  // Give the tree of buffers to the outfeed manager. The device will fill it
+ // while we're waiting for it below.
+ gpu::OutfeedManager* outfeed_manager = gpu::GetOrCreateOutfeedManager();
+ outfeed_manager->EnqueueDestination(&outfeed_buffers);
+
+ // Now turn the tree of buffers back into a literal.
+ *literal = std::move(*ShapeTreeToLiteral(&outfeed_buffers));
+ return Status::OK();
}
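
TransferLiteralFromOutfeed hands a tree of empty buffers to the outfeed manager and then blocks in WaitUntilAvailable() until the device has filled each one. The OutfeedBuffer class itself is not shown in this diff; the following is a minimal stand-in, assuming a mutex-and-condition-variable handshake, that models only the blocking behavior:

// Host/device handoff in miniature: the consumer blocks until the
// producer calls Done().
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

class Buffer {
 public:
  void Done(std::vector<int> data) {  // producer side (the "device")
    std::lock_guard<std::mutex> lock(mu_);
    data_ = std::move(data);
    ready_ = true;
    cv_.notify_one();
  }
  std::vector<int> WaitUntilAvailable() {  // consumer side (the host)
    std::unique_lock<std::mutex> lock(mu_);
    cv_.wait(lock, [this] { return ready_; });
    return std::move(data_);
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  bool ready_ = false;
  std::vector<int> data_;
};

int main() {
  Buffer buffer;
  std::thread device([&] { buffer.Done({1, 2, 3}); });   // fills the buffer
  std::vector<int> result = buffer.WaitUntilAvailable();  // blocks until Done()
  device.join();
  std::cout << "received " << result.size() << " elements\n";
}
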
+} // namespace gpu
} // namespace xla
-static std::unique_ptr<xla::TransferManager> CreateGpuTransferManager() {
- return xla::MakeUnique<xla::GpuTransferManager>();
+static std::unique_ptr<xla::TransferManager> CreateNVPTXTransferManager() {
+ return xla::MakeUnique<xla::gpu::GpuTransferManager>(
+ /*id=*/stream_executor::cuda::kCudaPlatformId,
+ /*pointer_size=*/llvm::DataLayout(xla::gpu::NVPTXCompiler::kDataLayout)
+ .getPointerSize(0 /* default address space */));
}
static bool InitModule() {
xla::TransferManager::RegisterTransferManager(
- stream_executor::cuda::kCudaPlatformId, &CreateGpuTransferManager);
+ stream_executor::cuda::kCudaPlatformId, &CreateNVPTXTransferManager);
return true;
}
static bool module_initialized = InitModule();
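
The InitModule()/module_initialized idiom above registers the factory as a side effect of static initialization, before main() runs. Here is a self-contained sketch of the same pattern with a hypothetical TransferManager registry; the real RegisterTransferManager lives in transfer_manager.h and differs in detail.

// Factory registration via a static initializer, in miniature.
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct TransferManager {
  virtual ~TransferManager() = default;
  virtual std::string Name() const = 0;
};

using Factory = std::function<std::unique_ptr<TransferManager>()>;

std::map<std::string, Factory>& Registry() {
  // Function-local static: constructed on first use, so registration is
  // safe regardless of static-initialization order across TUs.
  static std::map<std::string, Factory> registry;
  return registry;
}

struct NVPTXTransferManager : TransferManager {
  std::string Name() const override { return "nvptx"; }
};

static bool InitModule() {
  Registry()["cuda"] = [] {
    return std::unique_ptr<TransferManager>(new NVPTXTransferManager());
  };
  return true;
}
static bool module_initialized = InitModule();  // runs before main()

int main() {
  std::unique_ptr<TransferManager> tm = Registry()["cuda"]();
  std::cout << tm->Name() << "\n";  // prints "nvptx"
}
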