Diffstat (limited to 'tensorflow/compiler/jit/xla_device_context.cc')
-rw-r--r-- | tensorflow/compiler/jit/xla_device_context.cc | 262
1 file changed, 164 insertions(+), 98 deletions(-)
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index e20f5aa837..8cf198239c 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -48,17 +48,24 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
 
 XlaTransferManager::XlaTransferManager(
-    se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+    se::Stream* compute_stream, se::Stream* host_to_device_stream,
+    se::Stream* device_to_host_stream, xla::LocalClient* client,
+    bool transfer_as_literal,
     XlaCompiler::ShapeRepresentationFn shape_representation_fn)
-    : stream_(stream),
+    : stream_(compute_stream),
+      host_to_device_stream_(host_to_device_stream),
+      device_to_host_stream_(device_to_host_stream),
       client_(client),
       transfer_manager_(client->backend().transfer_manager()),
       transfer_as_literal_(transfer_as_literal),
       shape_representation_fn_(std::move(shape_representation_fn)) {
+  CHECK(host_to_device_stream_ != nullptr);
+  CHECK(device_to_host_stream_ != nullptr);
+  CHECK(stream_ != nullptr);
   if (!shape_representation_fn_) {
-    shape_representation_fn_ = [](const TensorShape& shape, DataType dtype) {
-      return shape;
-    };
+    shape_representation_fn_ =
+        [](const TensorShape& shape,
+           DataType dtype) -> xla::StatusOr<TensorShape> { return shape; };
   }
 }
 
@@ -74,15 +81,26 @@ Status XlaTransferManager::TransferLiteralToDevice(
   auto literal = std::make_shared<xla::BorrowingLiteral>(
       static_cast<const char*>(DMAHelper::base(&host_tensor)), xla_shape);
 
-  const xla::ShapedBuffer& shaped_buffer =
-      XlaTensor::FromTensor(device_tensor)->shaped_buffer();
+  XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+  const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer();
   VLOG(1) << "Transfer to device as literal: " << literal->ToString() << " "
           << shaped_buffer.ToString();
+  if (UseMultipleStreams()) {
+    // Initially wait for the compute stream so that memory allocations are
+    // synchronized.
+    host_to_device_stream_->ThenWaitFor(stream_);
+  }
   TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
-      stream_, *literal, shaped_buffer));
+      host_to_device_stream_, *literal, shaped_buffer));
+  if (UseMultipleStreams()) {
+    se::Event event(stream_->parent());
+    TF_RET_CHECK(event.Init()) << "Event failed to initialize!";
+    host_to_device_stream_->ThenRecordEvent(&event);
+    xla_tensor->SetDefinedOn(host_to_device_stream_, std::move(event));
+  }
   // Unref the host tensor, and capture the literal shared_ptr too so it goes
   // out of scope when the lambda completes.
-  stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
+  host_to_device_stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
   return Status::OK();
 }
 
@@ -94,7 +112,7 @@ void XlaTransferManager::TransferLiteralFromDevice(
   TensorReference ref(device_tensor);
   transfer_manager_->TransferLiteralFromDevice(
-      stream_, shaped_buffer,
+      device_to_host_stream_, shaped_buffer,
       [=, &shaped_buffer](
           xla::StatusOr<std::unique_ptr<xla::Literal> > literal_or) {
         ref.Unref();
@@ -120,62 +138,73 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                                Device* device,
                                                Tensor* device_tensor,
                                                StatusCallback done) const {
-  if (cpu_tensor->NumElements() > 0) {
-    VLOG(2) << "CopyCPUTensorToDevice "
-            << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
-            << " "
-            << reinterpret_cast<const void*>(
-                   device_tensor->tensor_data().data())
-            << " " << cpu_tensor->NumElements() << " "
-            << cpu_tensor->shape().DebugString() << " "
-            << device_tensor->shape().DebugString();
-
-    void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
-    const int64 total_bytes = cpu_tensor->TotalBytes();
-
-    XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
-    CHECK(xla_tensor);
-
-    TensorShape shape = shape_representation_fn_(device_tensor->shape(),
-                                                 device_tensor->dtype());
-    Status status;
-    if (!xla_tensor->has_shaped_buffer()) {
-      status = xla_tensor->AllocateShapedBuffer(
-          device_tensor->dtype(), shape, client_,
-          stream_->parent()->device_ordinal());
-      if (!status.ok()) {
-        return done(status);
-      }
-    }
+  if (cpu_tensor->NumElements() == 0) {
+    VLOG(2) << "CopyCPUTensorToDevice empty tensor";
+    done(Status::OK());
+    return;
+  }
 
-    if (transfer_as_literal_) {
-      Tensor reshaped_cpu_tensor;
-      if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) {
-        done(errors::Internal(
-            "Tensor::CopyFrom failed when copying from CPU to XLA device"));
-        return;
-      }
-      status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
-    } else {
-      se::DeviceMemoryBase dev_dst_ptr =
-          XlaTensor::DeviceMemoryFromTensor(*device_tensor);
-      stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
-      // TODO(hpucha): Make this asynchronous.
-      Status block_status = stream_->BlockHostUntilDone();
-      if (!block_status.ok()) {
-        status = xla::InternalError(
-            "Failed to complete data transfer on stream %p: %s", stream_,
-            block_status.error_message().c_str());
-      }
-    }
-    xla_tensor->set_host_tensor(*cpu_tensor);
+  VLOG(2) << "CopyCPUTensorToDevice "
+          << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
+          << " "
+          << reinterpret_cast<const void*>(device_tensor->tensor_data().data())
+          << " " << cpu_tensor->NumElements() << " "
+          << cpu_tensor->shape().DebugString() << " "
+          << device_tensor->shape().DebugString();
 
-    done(status);
+  void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
+  const int64 total_bytes = cpu_tensor->TotalBytes();
+
+  XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+  CHECK(xla_tensor);
+
+  xla::StatusOr<TensorShape> shape_or_status =
+      shape_representation_fn_(device_tensor->shape(), device_tensor->dtype());
+  if (!shape_or_status.ok()) {
+    done(shape_or_status.status());
     return;
   }
+  TensorShape shape = shape_or_status.ValueOrDie();
+  if (!xla_tensor->has_shaped_buffer()) {
+    Status s =
+        xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_,
+                                         stream_->parent()->device_ordinal());
+    if (!s.ok()) {
+      done(s);
+      return;
+    }
+  }
 
-  VLOG(2) << "CopyCPUTensorToDevice empty tensor";
-  done(Status::OK());
+  Status status;
+  if (transfer_as_literal_) {
+    Tensor reshaped_cpu_tensor;
+    if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) {
+      done(errors::Internal(
+          "Tensor::CopyFrom failed when copying from CPU to XLA device"));
+      return;
+    }
+    status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
+    if (status.ok()) {
+      xla_tensor->set_host_tensor(*cpu_tensor);
+      host_to_device_stream_->ThenDoHostCallback(
+          [done]() { done(Status::OK()); });
+      return;
+    }
+  } else {
+    se::DeviceMemoryBase dev_dst_ptr =
+        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
+    host_to_device_stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
+    // TODO(hpucha): Make this asynchronous.
+    Status block_status = host_to_device_stream_->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      status = xla::InternalError(
+          "Failed to complete data transfer on stream %p: %s",
+          host_to_device_stream_, block_status.error_message().c_str());
+    }
+  }
+  xla_tensor->set_host_tensor(*cpu_tensor);
+
+  done(status);
 }
 
 void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
@@ -183,68 +212,102 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
                                                Device* device,
                                                Tensor* cpu_tensor,
                                                StatusCallback done) {
-  if (device_tensor->NumElements() > 0) {
-    VLOG(2) << "CopyDeviceTensorToCPU "
-            << reinterpret_cast<const void*>(
-                   device_tensor->tensor_data().data())
-            << " "
-            << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
-            << " " << device_tensor->NumElements() << " "
-            << cpu_tensor->shape().DebugString() << " "
-            << device_tensor->shape().DebugString();
-
-    const int64 total_bytes = cpu_tensor->TotalBytes();
-    se::DeviceMemoryBase dev_src_ptr =
-        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
-    void* dst_ptr = DMAHelper::base(cpu_tensor);
+  if (device_tensor->NumElements() == 0) {
+    VLOG(2) << "CopyDeviceTensorToCPU empty tensor";
+    done(Status::OK());
+    return;
+  }
+  VLOG(2) << "CopyDeviceTensorToCPU "
+          << reinterpret_cast<const void*>(device_tensor->tensor_data().data())
+          << " "
+          << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
+          << " " << device_tensor->NumElements() << " "
+          << cpu_tensor->shape().DebugString() << " "
+          << device_tensor->shape().DebugString();
+
+  const int64 total_bytes = cpu_tensor->TotalBytes();
+  se::DeviceMemoryBase dev_src_ptr =
+      XlaTensor::DeviceMemoryFromTensor(*device_tensor);
+  void* dst_ptr = DMAHelper::base(cpu_tensor);
+  XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+
+  if (se::Event* event =
+          xla_tensor->GetDefinitionEvent(device_to_host_stream_)) {
+    device_to_host_stream_->ThenWaitFor(event);
+    xla_tensor->SetDefinedOn(device_to_host_stream_);
+  }
 
-    Status status;
-    if (transfer_as_literal_) {
-      TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
-      return;
-    } else {
-      stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
-      // TODO(hpucha): Make this asynchronous.
-      Status block_status = stream_->BlockHostUntilDone();
-      if (!block_status.ok()) {
-        status = xla::InternalError(
-            "Failed to complete data transfer on stream %p: %s", stream_,
-            block_status.error_message().c_str());
-      }
-      done(status);
-    }
+  Status status;
+  if (transfer_as_literal_) {
+    TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
     return;
+  } else {
+    device_to_host_stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
+    // TODO(hpucha): Make this asynchronous.
+    Status block_status = device_to_host_stream_->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      status = xla::InternalError(
+          "Failed to complete data transfer on stream %p: %s", stream_,
+          block_status.error_message().c_str());
+    }
   }
-  VLOG(2) << "CopyDeviceTensorToCPU empty tensor";
-  done(Status::OK());
+  done(status);
 }
 
 void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
                                                   Tensor* dst_tensor,
                                                   const StatusCallback& done) {
+  VLOG(2) << "CopyDeviceTensorToDevice "
+          << reinterpret_cast<const void*>(src_tensor.tensor_data().data())
+          << " "
+          << reinterpret_cast<const void*>(dst_tensor->tensor_data().data());
   // Perform memory allocation now, and enqueue the device-to-device transfer.
   Status status = [&]() -> Status {
     if (src_tensor.NumElements() == 0) {
      return Status::OK();
     }
+    // TODO(jmolloy): We co-opt the device_to_host stream for device to device
+    // transfers; perhaps we should have a dedicated device to device stream?
+    // or one per device?
+    auto device_to_device_stream = stream_;
     XlaTensor* xla_src = XlaTensor::FromTensor(&src_tensor);
     XlaTensor* xla_dst = XlaTensor::FromTensor(dst_tensor);
     CHECK(xla_src && xla_dst)
         << "Missing destination tensor for device-to-device copy";
     if (!xla_dst->has_shaped_buffer()) {
-      TensorShape shape =
-          shape_representation_fn_(src_tensor.shape(), src_tensor.dtype());
+      TF_ASSIGN_OR_RETURN(
+          TensorShape shape,
+          shape_representation_fn_(src_tensor.shape(), src_tensor.dtype()));
       TF_RETURN_IF_ERROR(
           xla_dst->AllocateShapedBuffer(src_tensor.dtype(), shape, client_,
                                         stream_->parent()->device_ordinal()));
+      if (stream_ != device_to_device_stream) {
+        // Initially wait for the compute stream so that memory allocations are
+        // synchronized.
+        device_to_device_stream->ThenWaitFor(stream_);
+      }
     }
+
+    if (se::Event* event =
+            xla_src->GetDefinitionEvent(device_to_device_stream)) {
+      device_to_device_stream->ThenWaitFor(event);
+      xla_src->SetDefinedOn(device_to_device_stream);
+    }
+
     auto from_iter = xla_src->shaped_buffer().buffers().begin();
     auto to_iter = xla_dst->shaped_buffer().buffers().begin();
     for (auto end_iter = xla_src->shaped_buffer().buffers().end();
          from_iter != end_iter; ++from_iter, ++to_iter) {
-      stream_->ThenMemcpyD2D(&to_iter->second, from_iter->second,
-                             to_iter->second.size());
+      device_to_device_stream->ThenMemcpyD2D(
+          &to_iter->second, from_iter->second, to_iter->second.size());
+    }
+
+    if (UseMultipleStreams()) {
+      se::Event event(stream_->parent());
+      CHECK(event.Init());
+      device_to_device_stream->ThenRecordEvent(&event);
+      xla_dst->SetDefinedOn(device_to_device_stream, std::move(event));
     }
     return Status::OK();
   }();
@@ -256,9 +319,12 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
 }
 
 XlaDeviceContext::XlaDeviceContext(
-    se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+    se::Stream* compute_stream, se::Stream* host_to_device_stream,
+    se::Stream* device_to_host_stream, xla::LocalClient* client,
+    bool transfer_as_literal,
     XlaCompiler::ShapeRepresentationFn shape_representation_fn)
-    : manager_(stream, client, transfer_as_literal,
+    : manager_(compute_stream, host_to_device_stream, device_to_host_stream,
+               client, transfer_as_literal,
                std::move(shape_representation_fn)) {}
 
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
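
In summary, this change splits XlaTransferManager/XlaDeviceContext's single stream into three: the compute stream (stream_), a host-to-device stream, and a device-to-host stream, so transfers in each direction can overlap with each other and with computation. Device-to-device copies still ride the compute stream (see the TODO(jmolloy) above). Because XLA allocates device memory in compute-stream order, a transfer stream first waits on the compute stream ("so that memory allocations are synchronized") before writing to a freshly allocated buffer, and shape_representation_fn_ now returns xla::StatusOr&lt;TensorShape&gt; so shape errors can be reported through the done() callbacks rather than being unrepresentable.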
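
The synchronization pattern used throughout the patch is record-and-wait: after enqueueing the writes that define a tensor, the producing stream records an se::Event (ThenRecordEvent) and the tensor remembers it (SetDefinedOn); any stream that later reads the tensor enqueues a wait on that event (GetDefinitionEvent / ThenWaitFor), so ordering is enforced on the device without blocking the host thread. On a CUDA-backed device these StreamExecutor calls boil down to stream/event primitives. The sketch below is illustrative only, not code from the patch; the two streams, the buffer, and its size are hypothetical.

// Record-and-wait across two CUDA streams, mirroring the se::Event
// pattern above. Illustrative sketch; names and sizes are hypothetical.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(expr)                                       \
  do {                                                         \
    cudaError_t err_ = (expr);                                 \
    if (err_ != cudaSuccess) {                                 \
      std::fprintf(stderr, "%s\n", cudaGetErrorString(err_));  \
      std::exit(1);                                            \
    }                                                          \
  } while (0)

int main() {
  cudaStream_t h2d_stream, compute_stream;
  cudaEvent_t transfer_done;
  CUDA_CHECK(cudaStreamCreate(&h2d_stream));
  CUDA_CHECK(cudaStreamCreate(&compute_stream));
  // Timing disabled: the event exists purely for ordering.
  CUDA_CHECK(cudaEventCreateWithFlags(&transfer_done, cudaEventDisableTiming));

  static float host_buf[256];
  float* dev_buf = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_buf, sizeof(host_buf)));

  // Producer side, the analogue of ThenRecordEvent + SetDefinedOn:
  // enqueue the copy, then record an event behind it.
  CUDA_CHECK(cudaMemcpyAsync(dev_buf, host_buf, sizeof(host_buf),
                             cudaMemcpyHostToDevice, h2d_stream));
  CUDA_CHECK(cudaEventRecord(transfer_done, h2d_stream));

  // Consumer side, the analogue of GetDefinitionEvent + ThenWaitFor:
  // the wait is enqueued on the device; the host thread never blocks.
  CUDA_CHECK(cudaStreamWaitEvent(compute_stream, transfer_done, 0));
  // ... kernels launched on compute_stream now see the transferred data ...

  CUDA_CHECK(cudaStreamSynchronize(compute_stream));
  cudaFree(dev_buf);
  cudaEventDestroy(transfer_done);
  cudaStreamDestroy(h2d_stream);
  cudaStreamDestroy(compute_stream);
  return 0;
}

Note the reverse dependency also present in the patch: before the host-to-device transfer, host_to_device_stream_ waits on stream_, which in this sketch would be a cudaStreamWaitEvent in the opposite direction, ordering the copy after the compute stream's prior allocations.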
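
Relatedly, the literal path of CopyCPUTensorToDevice now completes asynchronously: done is fired from a host callback enqueued behind the transfer (ThenDoHostCallback) instead of after BlockHostUntilDone(), while the raw-memcpy path keeps its blocking TODO(hpucha). A minimal CUDA analogue of that completion pattern, assuming CUDA 10+ for cudaLaunchHostFunc (buffer and tag are hypothetical; error handling elided for brevity):

#include <cstdio>
#include <cuda_runtime.h>

// Runs on a CUDA-managed thread once all prior work on the stream has
// finished; CUDA API calls are not allowed inside the callback.
static void CUDART_CB NotifyDone(void* user_data) {
  std::printf("transfer complete: %s\n", static_cast<const char*>(user_data));
}

int main() {
  cudaStream_t h2d_stream;
  cudaStreamCreate(&h2d_stream);

  static float host_buf[64];
  float* dev_buf = nullptr;
  cudaMalloc(&dev_buf, sizeof(host_buf));
  cudaMemcpyAsync(dev_buf, host_buf, sizeof(host_buf),
                  cudaMemcpyHostToDevice, h2d_stream);

  // The analogue of ThenDoHostCallback: the "done" notification is queued
  // behind the copy instead of blocking the host with a full sync.
  static char tag[] = "h2d";
  cudaLaunchHostFunc(h2d_stream, NotifyDone, tag);

  cudaStreamSynchronize(h2d_stream);  // only to keep the demo process alive
  cudaFree(dev_buf);
  cudaStreamDestroy(h2d_stream);
  return 0;
}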