Diffstat (limited to 'tensorflow/compiler/jit/xla_device_context.cc')
-rw-r--r--  tensorflow/compiler/jit/xla_device_context.cc | 262
1 file changed, 164 insertions(+), 98 deletions(-)
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index e20f5aa837..8cf198239c 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -48,17 +48,24 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
XlaTransferManager::XlaTransferManager(
- se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+ se::Stream* compute_stream, se::Stream* host_to_device_stream,
+ se::Stream* device_to_host_stream, xla::LocalClient* client,
+ bool transfer_as_literal,
XlaCompiler::ShapeRepresentationFn shape_representation_fn)
- : stream_(stream),
+ : stream_(compute_stream),
+ host_to_device_stream_(host_to_device_stream),
+ device_to_host_stream_(device_to_host_stream),
client_(client),
transfer_manager_(client->backend().transfer_manager()),
transfer_as_literal_(transfer_as_literal),
shape_representation_fn_(std::move(shape_representation_fn)) {
+ CHECK(host_to_device_stream_ != nullptr);
+ CHECK(device_to_host_stream_ != nullptr);
+ CHECK(stream_ != nullptr);
if (!shape_representation_fn_) {
- shape_representation_fn_ = [](const TensorShape& shape, DataType dtype) {
- return shape;
- };
+ shape_representation_fn_ =
+ [](const TensorShape& shape,
+ DataType dtype) -> xla::StatusOr<TensorShape> { return shape; };
}
}
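Note (not part of the patch): the constructor now takes three streams, and the hunks below guard the new synchronization logic with UseMultipleStreams(), whose definition is outside this diff. A minimal sketch of one plausible reading, assuming the helper simply reports whether a separate transfer stream is in use; the body below is an assumption, not the patch's code.

// Hypothetical helper; the real definition lives outside this diff.
bool XlaTransferManager::UseMultipleStreams() const {
  // Cross-stream events are only needed when a transfer stream is distinct
  // from the compute stream.
  return host_to_device_stream_ != stream_ ||
         device_to_host_stream_ != stream_;
}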
@@ -74,15 +81,26 @@ Status XlaTransferManager::TransferLiteralToDevice(
auto literal = std::make_shared<xla::BorrowingLiteral>(
static_cast<const char*>(DMAHelper::base(&host_tensor)), xla_shape);
- const xla::ShapedBuffer& shaped_buffer =
- XlaTensor::FromTensor(device_tensor)->shaped_buffer();
+ XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+ const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer();
VLOG(1) << "Transfer to device as literal: " << literal->ToString() << " "
<< shaped_buffer.ToString();
+ if (UseMultipleStreams()) {
+ // Initially wait for the compute stream so that memory allocations are
+ // synchronized.
+ host_to_device_stream_->ThenWaitFor(stream_);
+ }
TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
- stream_, *literal, shaped_buffer));
+ host_to_device_stream_, *literal, shaped_buffer));
+ if (UseMultipleStreams()) {
+ se::Event event(stream_->parent());
+ TF_RET_CHECK(event.Init()) << "Event failed to initialize!";
+ host_to_device_stream_->ThenRecordEvent(&event);
+ xla_tensor->SetDefinedOn(host_to_device_stream_, std::move(event));
+ }
// Unref the host tensor, and capture the literal shared_ptr too so it goes
// out of scope when the lambda completes.
- stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
+ host_to_device_stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
return Status::OK();
}
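Note (not part of the patch): the hunk above introduces the record/wait idiom that the rest of the change repeats: the stream that produces a buffer records an se::Event, and any stream that later consumes the buffer waits on it. A standalone sketch of that idiom follows; the event must outlive the enqueued work, which the patch ensures by moving it into the XlaTensor via SetDefinedOn.

// Illustrative only: order work so that `consumer` does not start until the
// work already enqueued on `producer` has finished.
Status OrderStreams(se::Stream* producer, se::Stream* consumer,
                    se::Event* event) {
  if (!event->Init()) {
    return errors::Internal("Event failed to initialize");
  }
  producer->ThenRecordEvent(event);  // signalled once producer's queued work is done
  consumer->ThenWaitFor(event);      // consumer work enqueued after this waits
  return Status::OK();
}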
@@ -94,7 +112,7 @@ void XlaTransferManager::TransferLiteralFromDevice(
TensorReference ref(device_tensor);
transfer_manager_->TransferLiteralFromDevice(
- stream_, shaped_buffer,
+ device_to_host_stream_, shaped_buffer,
[=, &shaped_buffer](
xla::StatusOr<std::unique_ptr<xla::Literal> > literal_or) {
ref.Unref();
@@ -120,62 +138,73 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
Device* device,
Tensor* device_tensor,
StatusCallback done) const {
- if (cpu_tensor->NumElements() > 0) {
- VLOG(2) << "CopyCPUTensorToDevice "
- << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
- << " "
- << reinterpret_cast<const void*>(
- device_tensor->tensor_data().data())
- << " " << cpu_tensor->NumElements() << " "
- << cpu_tensor->shape().DebugString() << " "
- << device_tensor->shape().DebugString();
-
- void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
- const int64 total_bytes = cpu_tensor->TotalBytes();
-
- XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
- CHECK(xla_tensor);
-
- TensorShape shape = shape_representation_fn_(device_tensor->shape(),
- device_tensor->dtype());
- Status status;
- if (!xla_tensor->has_shaped_buffer()) {
- status = xla_tensor->AllocateShapedBuffer(
- device_tensor->dtype(), shape, client_,
- stream_->parent()->device_ordinal());
- if (!status.ok()) {
- return done(status);
- }
- }
+ if (cpu_tensor->NumElements() == 0) {
+ VLOG(2) << "CopyCPUTensorToDevice empty tensor";
+ done(Status::OK());
+ return;
+ }
- if (transfer_as_literal_) {
- Tensor reshaped_cpu_tensor;
- if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) {
- done(errors::Internal(
- "Tensor::CopyFrom failed when copying from CPU to XLA device"));
- return;
- }
- status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
- } else {
- se::DeviceMemoryBase dev_dst_ptr =
- XlaTensor::DeviceMemoryFromTensor(*device_tensor);
- stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
- // TODO(hpucha): Make this asynchronous.
- Status block_status = stream_->BlockHostUntilDone();
- if (!block_status.ok()) {
- status = xla::InternalError(
- "Failed to complete data transfer on stream %p: %s", stream_,
- block_status.error_message().c_str());
- }
- }
- xla_tensor->set_host_tensor(*cpu_tensor);
+ VLOG(2) << "CopyCPUTensorToDevice "
+ << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
+ << " "
+ << reinterpret_cast<const void*>(device_tensor->tensor_data().data())
+ << " " << cpu_tensor->NumElements() << " "
+ << cpu_tensor->shape().DebugString() << " "
+ << device_tensor->shape().DebugString();
- done(status);
+ void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
+ const int64 total_bytes = cpu_tensor->TotalBytes();
+
+ XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+ CHECK(xla_tensor);
+
+ xla::StatusOr<TensorShape> shape_or_status =
+ shape_representation_fn_(device_tensor->shape(), device_tensor->dtype());
+ if (!shape_or_status.ok()) {
+ done(shape_or_status.status());
return;
}
+ TensorShape shape = shape_or_status.ValueOrDie();
+ if (!xla_tensor->has_shaped_buffer()) {
+ Status s =
+ xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_,
+ stream_->parent()->device_ordinal());
+ if (!s.ok()) {
+ done(s);
+ return;
+ }
+ }
- VLOG(2) << "CopyCPUTensorToDevice empty tensor";
- done(Status::OK());
+ Status status;
+ if (transfer_as_literal_) {
+ Tensor reshaped_cpu_tensor;
+ if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) {
+ done(errors::Internal(
+ "Tensor::CopyFrom failed when copying from CPU to XLA device"));
+ return;
+ }
+ status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
+ if (status.ok()) {
+ xla_tensor->set_host_tensor(*cpu_tensor);
+ host_to_device_stream_->ThenDoHostCallback(
+ [done]() { done(Status::OK()); });
+ return;
+ }
+ } else {
+ se::DeviceMemoryBase dev_dst_ptr =
+ XlaTensor::DeviceMemoryFromTensor(*device_tensor);
+ host_to_device_stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
+ // TODO(hpucha): Make this asynchronous.
+ Status block_status = host_to_device_stream_->BlockHostUntilDone();
+ if (!block_status.ok()) {
+ status = xla::InternalError(
+ "Failed to complete data transfer on stream %p: %s",
+ host_to_device_stream_, block_status.error_message().c_str());
+ }
+ }
+ xla_tensor->set_host_tensor(*cpu_tensor);
+
+ done(status);
}
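Note (not part of the patch): CopyCPUTensorToDevice now propagates failures from shape_representation_fn_, which returns xla::StatusOr<TensorShape> after this change. A hypothetical caller-supplied function under the new signature; the rank check is made up for illustration.

// Hypothetical example: a shape_representation_fn that can report an error
// instead of only returning a shape.
XlaCompiler::ShapeRepresentationFn representation_fn =
    [](const TensorShape& shape,
       DataType dtype) -> xla::StatusOr<TensorShape> {
  if (shape.dims() > 4) {
    return errors::InvalidArgument("Unsupported rank: ", shape.dims());
  }
  return shape;
};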
void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
@@ -183,68 +212,102 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
Device* device,
Tensor* cpu_tensor,
StatusCallback done) {
- if (device_tensor->NumElements() > 0) {
- VLOG(2) << "CopyDeviceTensorToCPU "
- << reinterpret_cast<const void*>(
- device_tensor->tensor_data().data())
- << " "
- << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
- << " " << device_tensor->NumElements() << " "
- << cpu_tensor->shape().DebugString() << " "
- << device_tensor->shape().DebugString();
-
- const int64 total_bytes = cpu_tensor->TotalBytes();
- se::DeviceMemoryBase dev_src_ptr =
- XlaTensor::DeviceMemoryFromTensor(*device_tensor);
- void* dst_ptr = DMAHelper::base(cpu_tensor);
+ if (device_tensor->NumElements() == 0) {
+ VLOG(2) << "CopyDeviceTensorToCPU empty tensor";
+ done(Status::OK());
+ return;
+ }
+ VLOG(2) << "CopyDeviceTensorToCPU "
+ << reinterpret_cast<const void*>(device_tensor->tensor_data().data())
+ << " "
+ << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
+ << " " << device_tensor->NumElements() << " "
+ << cpu_tensor->shape().DebugString() << " "
+ << device_tensor->shape().DebugString();
+
+ const int64 total_bytes = cpu_tensor->TotalBytes();
+ se::DeviceMemoryBase dev_src_ptr =
+ XlaTensor::DeviceMemoryFromTensor(*device_tensor);
+ void* dst_ptr = DMAHelper::base(cpu_tensor);
+ XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+
+ if (se::Event* event =
+ xla_tensor->GetDefinitionEvent(device_to_host_stream_)) {
+ device_to_host_stream_->ThenWaitFor(event);
+ xla_tensor->SetDefinedOn(device_to_host_stream_);
+ }
- Status status;
- if (transfer_as_literal_) {
- TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
- return;
- } else {
- stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
- // TODO(hpucha): Make this asynchronous.
- Status block_status = stream_->BlockHostUntilDone();
- if (!block_status.ok()) {
- status = xla::InternalError(
- "Failed to complete data transfer on stream %p: %s", stream_,
- block_status.error_message().c_str());
- }
- done(status);
- }
+ Status status;
+ if (transfer_as_literal_) {
+ TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
return;
+ } else {
+ device_to_host_stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
+ // TODO(hpucha): Make this asynchronous.
+ Status block_status = device_to_host_stream_->BlockHostUntilDone();
+ if (!block_status.ok()) {
+ status = xla::InternalError(
+ "Failed to complete data transfer on stream %p: %s", stream_,
+ block_status.error_message().c_str());
+ }
}
- VLOG(2) << "CopyDeviceTensorToCPU empty tensor";
- done(Status::OK());
+ done(status);
}
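Note (not part of the patch): CopyDeviceTensorToCPU now consults the tensor's definition event before reading it on the device-to-host stream. The consumer-side half of the pattern, extracted as a sketch:

// Illustrative only: before `reader` touches a tensor that may have been
// written on another stream, wait for its definition event and record that
// the tensor is now defined with respect to `reader`.
void WaitUntilDefined(XlaTensor* tensor, se::Stream* reader) {
  if (se::Event* event = tensor->GetDefinitionEvent(reader)) {
    reader->ThenWaitFor(event);
    tensor->SetDefinedOn(reader);
  }
}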
void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
Tensor* dst_tensor,
const StatusCallback& done) {
+ VLOG(2) << "CopyDeviceTensorToDevice "
+ << reinterpret_cast<const void*>(src_tensor.tensor_data().data())
+ << " "
+ << reinterpret_cast<const void*>(dst_tensor->tensor_data().data());
// Perform memory allocation now, and enqueue the device-to-device transfer.
Status status = [&]() -> Status {
if (src_tensor.NumElements() == 0) {
return Status::OK();
}
+ // TODO(jmolloy): We co-opt the device_to_host stream for device to device
+ // transfers; perhaps we should have a dedicated device to device stream? or
+ // one per device?
+ auto device_to_device_stream = stream_;
XlaTensor* xla_src = XlaTensor::FromTensor(&src_tensor);
XlaTensor* xla_dst = XlaTensor::FromTensor(dst_tensor);
CHECK(xla_src && xla_dst)
<< "Missing destination tensor for device-to-device copy";
if (!xla_dst->has_shaped_buffer()) {
- TensorShape shape =
- shape_representation_fn_(src_tensor.shape(), src_tensor.dtype());
+ TF_ASSIGN_OR_RETURN(
+ TensorShape shape,
+ shape_representation_fn_(src_tensor.shape(), src_tensor.dtype()));
TF_RETURN_IF_ERROR(
xla_dst->AllocateShapedBuffer(src_tensor.dtype(), shape, client_,
stream_->parent()->device_ordinal()));
+ if (stream_ != device_to_device_stream) {
+ // Initially wait for the compute stream so that memory allocations are
+ // synchronized.
+ device_to_device_stream->ThenWaitFor(stream_);
+ }
}
+
+ if (se::Event* event =
+ xla_src->GetDefinitionEvent(device_to_device_stream)) {
+ device_to_device_stream->ThenWaitFor(event);
+ xla_src->SetDefinedOn(device_to_device_stream);
+ }
+
auto from_iter = xla_src->shaped_buffer().buffers().begin();
auto to_iter = xla_dst->shaped_buffer().buffers().begin();
for (auto end_iter = xla_src->shaped_buffer().buffers().end();
from_iter != end_iter; ++from_iter, ++to_iter) {
- stream_->ThenMemcpyD2D(&to_iter->second, from_iter->second,
- to_iter->second.size());
+ device_to_device_stream->ThenMemcpyD2D(
+ &to_iter->second, from_iter->second, to_iter->second.size());
+ }
+
+ if (UseMultipleStreams()) {
+ se::Event event(stream_->parent());
+ CHECK(event.Init());
+ device_to_device_stream->ThenRecordEvent(&event);
+ xla_dst->SetDefinedOn(device_to_device_stream, std::move(event));
}
return Status::OK();
}();
@@ -256,9 +319,12 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
}
XlaDeviceContext::XlaDeviceContext(
- se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+ se::Stream* compute_stream, se::Stream* host_to_device_stream,
+ se::Stream* device_to_host_stream, xla::LocalClient* client,
+ bool transfer_as_literal,
XlaCompiler::ShapeRepresentationFn shape_representation_fn)
- : manager_(stream, client, transfer_as_literal,
+ : manager_(compute_stream, host_to_device_stream, device_to_host_stream,
+ client, transfer_as_literal,
std::move(shape_representation_fn)) {}
void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,