Diffstat (limited to 'tensorflow/compiler/jit/xla_device_context.cc')
-rw-r--r-- | tensorflow/compiler/jit/xla_device_context.cc | 262
1 file changed, 164 insertions(+), 98 deletions(-)
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index e20f5aa837..8cf198239c 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -48,17 +48,24 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
 
 XlaTransferManager::XlaTransferManager(
-    se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+    se::Stream* compute_stream, se::Stream* host_to_device_stream,
+    se::Stream* device_to_host_stream, xla::LocalClient* client,
+    bool transfer_as_literal,
     XlaCompiler::ShapeRepresentationFn shape_representation_fn)
-    : stream_(stream),
+    : stream_(compute_stream),
+      host_to_device_stream_(host_to_device_stream),
+      device_to_host_stream_(device_to_host_stream),
       client_(client),
       transfer_manager_(client->backend().transfer_manager()),
       transfer_as_literal_(transfer_as_literal),
       shape_representation_fn_(std::move(shape_representation_fn)) {
+  CHECK(host_to_device_stream_ != nullptr);
+  CHECK(device_to_host_stream_ != nullptr);
+  CHECK(stream_ != nullptr);
   if (!shape_representation_fn_) {
-    shape_representation_fn_ = [](const TensorShape& shape, DataType dtype) {
-      return shape;
-    };
+    shape_representation_fn_ =
+        [](const TensorShape& shape,
+           DataType dtype) -> xla::StatusOr<TensorShape> { return shape; };
   }
 }
 
@@ -74,15 +81,26 @@ Status XlaTransferManager::TransferLiteralToDevice(
   auto literal = std::make_shared<xla::BorrowingLiteral>(
       static_cast<const char*>(DMAHelper::base(&host_tensor)), xla_shape);
 
-  const xla::ShapedBuffer& shaped_buffer =
-      XlaTensor::FromTensor(device_tensor)->shaped_buffer();
+  XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+  const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer();
   VLOG(1) << "Transfer to device as literal: " << literal->ToString() << " "
           << shaped_buffer.ToString();
+  if (UseMultipleStreams()) {
+    // Initially wait for the compute stream so that memory allocations are
+    // synchronized.
+    host_to_device_stream_->ThenWaitFor(stream_);
+  }
   TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
-      stream_, *literal, shaped_buffer));
+      host_to_device_stream_, *literal, shaped_buffer));
+  if (UseMultipleStreams()) {
+    se::Event event(stream_->parent());
+    TF_RET_CHECK(event.Init()) << "Event failed to initialize!";
+    host_to_device_stream_->ThenRecordEvent(&event);
+    xla_tensor->SetDefinedOn(host_to_device_stream_, std::move(event));
+  }
   // Unref the host tensor, and capture the literal shared_ptr too so it goes
   // out of scope when the lambda completes.
-  stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
+  host_to_device_stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
   return Status::OK();
 }
 
@@ -94,7 +112,7 @@ void XlaTransferManager::TransferLiteralFromDevice(
   TensorReference ref(device_tensor);
   transfer_manager_->TransferLiteralFromDevice(
-      stream_, shaped_buffer,
+      device_to_host_stream_, shaped_buffer,
       [=, &shaped_buffer](
           xla::StatusOr<std::unique_ptr<xla::Literal> > literal_or) {
         ref.Unref();
@@ -120,62 +138,73 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                                Device* device,
                                                Tensor* device_tensor,
                                                StatusCallback done) const {
-  if (cpu_tensor->NumElements() > 0) {
-    VLOG(2) << "CopyCPUTensorToDevice "
-            << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
-            << " "
-            << reinterpret_cast<const void*>(
-                   device_tensor->tensor_data().data())
-            << " " << cpu_tensor->NumElements() << " "
-            << cpu_tensor->shape().DebugString() << " "
-            << device_tensor->shape().DebugString();
-
-    void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
-    const int64 total_bytes = cpu_tensor->TotalBytes();
-
-    XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
-    CHECK(xla_tensor);
-
-    TensorShape shape = shape_representation_fn_(device_tensor->shape(),
-                                                 device_tensor->dtype());
-    Status status;
-    if (!xla_tensor->has_shaped_buffer()) {
-      status = xla_tensor->AllocateShapedBuffer(
-          device_tensor->dtype(), shape, client_,
-          stream_->parent()->device_ordinal());
-      if (!status.ok()) {
-        return done(status);
-      }
-    }
+  if (cpu_tensor->NumElements() == 0) {
+    VLOG(2) << "CopyCPUTensorToDevice empty tensor";
+    done(Status::OK());
+    return;
+  }
 
-    if (transfer_as_literal_) {
-      Tensor reshaped_cpu_tensor;
-      if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) {
-        done(errors::Internal(
-            "Tensor::CopyFrom failed when copying from CPU to XLA device"));
-        return;
-      }
-      status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
-    } else {
-      se::DeviceMemoryBase dev_dst_ptr =
-          XlaTensor::DeviceMemoryFromTensor(*device_tensor);
-      stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
-      // TODO(hpucha): Make this asynchronous.
-      Status block_status = stream_->BlockHostUntilDone();
-      if (!block_status.ok()) {
-        status = xla::InternalError(
-            "Failed to complete data transfer on stream %p: %s", stream_,
-            block_status.error_message().c_str());
-      }
-    }
-    xla_tensor->set_host_tensor(*cpu_tensor);
+  VLOG(2) << "CopyCPUTensorToDevice "
+          << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
+          << " "
+          << reinterpret_cast<const void*>(device_tensor->tensor_data().data())
+          << " " << cpu_tensor->NumElements() << " "
+          << cpu_tensor->shape().DebugString() << " "
+          << device_tensor->shape().DebugString();
 
-    done(status);
+  void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
+  const int64 total_bytes = cpu_tensor->TotalBytes();
+
+  XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+  CHECK(xla_tensor);
+
+  xla::StatusOr<TensorShape> shape_or_status =
+      shape_representation_fn_(device_tensor->shape(), device_tensor->dtype());
+  if (!shape_or_status.ok()) {
+    done(shape_or_status.status());
     return;
   }
+  TensorShape shape = shape_or_status.ValueOrDie();
+  if (!xla_tensor->has_shaped_buffer()) {
+    Status s =
+        xla_tensor->AllocateShapedBuffer(device_tensor->dtype(), shape, client_,
+                                         stream_->parent()->device_ordinal());
+    if (!s.ok()) {
+      done(s);
+      return;
+    }
+  }
 
-  VLOG(2) << "CopyCPUTensorToDevice empty tensor";
-  done(Status::OK());
+  Status status;
+  if (transfer_as_literal_) {
+    Tensor reshaped_cpu_tensor;
+    if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) {
+      done(errors::Internal(
+          "Tensor::CopyFrom failed when copying from CPU to XLA device"));
+      return;
+    }
+    status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
+    if (status.ok()) {
+      xla_tensor->set_host_tensor(*cpu_tensor);
+      host_to_device_stream_->ThenDoHostCallback(
+          [done]() { done(Status::OK()); });
+      return;
+    }
+  } else {
+    se::DeviceMemoryBase dev_dst_ptr =
+        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
+    host_to_device_stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
+    // TODO(hpucha): Make this asynchronous.
+    Status block_status = host_to_device_stream_->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      status = xla::InternalError(
+          "Failed to complete data transfer on stream %p: %s",
+          host_to_device_stream_, block_status.error_message().c_str());
+    }
+  }
+  xla_tensor->set_host_tensor(*cpu_tensor);
+
+  done(status);
 }
 
 void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
@@ -183,68 +212,102 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
                                                Device* device,
                                                Tensor* cpu_tensor,
                                                StatusCallback done) {
-  if (device_tensor->NumElements() > 0) {
-    VLOG(2) << "CopyDeviceTensorToCPU "
-            << reinterpret_cast<const void*>(
-                   device_tensor->tensor_data().data())
-            << " "
-            << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
-            << " " << device_tensor->NumElements() << " "
-            << cpu_tensor->shape().DebugString() << " "
-            << device_tensor->shape().DebugString();
-
-    const int64 total_bytes = cpu_tensor->TotalBytes();
-    se::DeviceMemoryBase dev_src_ptr =
-        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
-    void* dst_ptr = DMAHelper::base(cpu_tensor);
+  if (device_tensor->NumElements() == 0) {
+    VLOG(2) << "CopyDeviceTensorToCPU empty tensor";
+    done(Status::OK());
+    return;
+  }
+  VLOG(2) << "CopyDeviceTensorToCPU "
+          << reinterpret_cast<const void*>(device_tensor->tensor_data().data())
+          << " "
+          << reinterpret_cast<const void*>(cpu_tensor->tensor_data().data())
+          << " " << device_tensor->NumElements() << " "
+          << cpu_tensor->shape().DebugString() << " "
+          << device_tensor->shape().DebugString();
+
+  const int64 total_bytes = cpu_tensor->TotalBytes();
+  se::DeviceMemoryBase dev_src_ptr =
+      XlaTensor::DeviceMemoryFromTensor(*device_tensor);
+  void* dst_ptr = DMAHelper::base(cpu_tensor);
+  XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+
+  if (se::Event* event =
+          xla_tensor->GetDefinitionEvent(device_to_host_stream_)) {
+    device_to_host_stream_->ThenWaitFor(event);
+    xla_tensor->SetDefinedOn(device_to_host_stream_);
+  }
 
-    Status status;
-    if (transfer_as_literal_) {
-      TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
-      return;
-    } else {
-      stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
-      // TODO(hpucha): Make this asynchronous.
-      Status block_status = stream_->BlockHostUntilDone();
-      if (!block_status.ok()) {
-        status = xla::InternalError(
-            "Failed to complete data transfer on stream %p: %s", stream_,
-            block_status.error_message().c_str());
-      }
-      done(status);
-    }
+  Status status;
+  if (transfer_as_literal_) {
+    TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
     return;
+  } else {
+    device_to_host_stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
+    // TODO(hpucha): Make this asynchronous.
+    Status block_status = device_to_host_stream_->BlockHostUntilDone();
+    if (!block_status.ok()) {
+      status = xla::InternalError(
+          "Failed to complete data transfer on stream %p: %s", stream_,
+          block_status.error_message().c_str());
+    }
   }
-  VLOG(2) << "CopyDeviceTensorToCPU empty tensor";
-  done(Status::OK());
+  done(status);
 }
 
 void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
                                                   Tensor* dst_tensor,
                                                   const StatusCallback& done) {
+  VLOG(2) << "CopyDeviceTensorToDevice "
+          << reinterpret_cast<const void*>(src_tensor.tensor_data().data())
+          << " "
+          << reinterpret_cast<const void*>(dst_tensor->tensor_data().data());
   // Perform memory allocation now, and enqueue the device-to-device transfer.
   Status status = [&]() -> Status {
     if (src_tensor.NumElements() == 0) {
      return Status::OK();
     }
+    // TODO(jmolloy): We co-opt the device_to_host stream for device to device
+    // transfers; perhaps we should have a dedicated device to device stream?
+    // or one per device?
+    auto device_to_device_stream = stream_;
     XlaTensor* xla_src = XlaTensor::FromTensor(&src_tensor);
     XlaTensor* xla_dst = XlaTensor::FromTensor(dst_tensor);
     CHECK(xla_src && xla_dst)
         << "Missing destination tensor for device-to-device copy";
     if (!xla_dst->has_shaped_buffer()) {
-      TensorShape shape =
-          shape_representation_fn_(src_tensor.shape(), src_tensor.dtype());
+      TF_ASSIGN_OR_RETURN(
+          TensorShape shape,
+          shape_representation_fn_(src_tensor.shape(), src_tensor.dtype()));
       TF_RETURN_IF_ERROR(
           xla_dst->AllocateShapedBuffer(src_tensor.dtype(), shape, client_,
                                         stream_->parent()->device_ordinal()));
+      if (stream_ != device_to_device_stream) {
+        // Initially wait for the compute stream so that memory allocations are
+        // synchronized.
+        device_to_device_stream->ThenWaitFor(stream_);
+      }
     }
+
+    if (se::Event* event =
+            xla_src->GetDefinitionEvent(device_to_device_stream)) {
+      device_to_device_stream->ThenWaitFor(event);
+      xla_src->SetDefinedOn(device_to_device_stream);
+    }
+
     auto from_iter = xla_src->shaped_buffer().buffers().begin();
     auto to_iter = xla_dst->shaped_buffer().buffers().begin();
     for (auto end_iter = xla_src->shaped_buffer().buffers().end();
          from_iter != end_iter; ++from_iter, ++to_iter) {
-      stream_->ThenMemcpyD2D(&to_iter->second, from_iter->second,
-                             to_iter->second.size());
+      device_to_device_stream->ThenMemcpyD2D(
+          &to_iter->second, from_iter->second, to_iter->second.size());
+    }
+
+    if (UseMultipleStreams()) {
+      se::Event event(stream_->parent());
+      CHECK(event.Init());
+      device_to_device_stream->ThenRecordEvent(&event);
+      xla_dst->SetDefinedOn(device_to_device_stream, std::move(event));
     }
     return Status::OK();
   }();
@@ -256,9 +319,12 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
 }
 
 XlaDeviceContext::XlaDeviceContext(
-    se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal,
+    se::Stream* compute_stream, se::Stream* host_to_device_stream,
+    se::Stream* device_to_host_stream, xla::LocalClient* client,
+    bool transfer_as_literal,
     XlaCompiler::ShapeRepresentationFn shape_representation_fn)
-    : manager_(stream, client, transfer_as_literal,
+    : manager_(compute_stream, host_to_device_stream, device_to_host_stream,
+               client, transfer_as_literal,
                std::move(shape_representation_fn)) {}
 
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
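
In summary, this change splits XlaTransferManager/XlaDeviceContext's single stream into three: the compute stream (stream_), a host-to-device stream, and a device-to-host stream, so transfers in each direction can overlap with each other and with computation. Device-to-device copies still ride the compute stream (see the TODO(jmolloy) above). Because XLA allocates device memory in compute-stream order, a transfer stream first waits on the compute stream ("so that memory allocations are synchronized") before writing to a freshly allocated buffer, and shape_representation_fn_ now returns xla::StatusOr&lt;TensorShape&gt; so shape errors can be reported through the done() callbacks rather than being unrepresentable.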
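
The synchronization pattern used throughout the patch is record-and-wait: after enqueueing the writes that define a tensor, the producing stream records an se::Event (ThenRecordEvent) and the tensor remembers it (SetDefinedOn); any stream that later reads the tensor enqueues a wait on that event (GetDefinitionEvent / ThenWaitFor), so ordering is enforced on the device without blocking the host thread. On a CUDA-backed device these StreamExecutor calls boil down to stream/event primitives. The sketch below is illustrative only, not code from the patch; the two streams, the buffer, and its size are hypothetical.

// Record-and-wait across two CUDA streams, mirroring the se::Event
// pattern above. Illustrative sketch; names and sizes are hypothetical.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(expr)                                       \
  do {                                                         \
    cudaError_t err_ = (expr);                                 \
    if (err_ != cudaSuccess) {                                 \
      std::fprintf(stderr, "%s\n", cudaGetErrorString(err_));  \
      std::exit(1);                                            \
    }                                                          \
  } while (0)

int main() {
  cudaStream_t h2d_stream, compute_stream;
  cudaEvent_t transfer_done;
  CUDA_CHECK(cudaStreamCreate(&h2d_stream));
  CUDA_CHECK(cudaStreamCreate(&compute_stream));
  // Timing disabled: the event exists purely for ordering.
  CUDA_CHECK(cudaEventCreateWithFlags(&transfer_done, cudaEventDisableTiming));

  static float host_buf[256];
  float* dev_buf = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_buf, sizeof(host_buf)));

  // Producer side, the analogue of ThenRecordEvent + SetDefinedOn:
  // enqueue the copy, then record an event behind it.
  CUDA_CHECK(cudaMemcpyAsync(dev_buf, host_buf, sizeof(host_buf),
                             cudaMemcpyHostToDevice, h2d_stream));
  CUDA_CHECK(cudaEventRecord(transfer_done, h2d_stream));

  // Consumer side, the analogue of GetDefinitionEvent + ThenWaitFor:
  // the wait is enqueued on the device; the host thread never blocks.
  CUDA_CHECK(cudaStreamWaitEvent(compute_stream, transfer_done, 0));
  // ... kernels launched on compute_stream now see the transferred data ...

  CUDA_CHECK(cudaStreamSynchronize(compute_stream));
  cudaFree(dev_buf);
  cudaEventDestroy(transfer_done);
  cudaStreamDestroy(h2d_stream);
  cudaStreamDestroy(compute_stream);
  return 0;
}

Note the reverse dependency also present in the patch: before the host-to-device transfer, host_to_device_stream_ waits on stream_, which in this sketch would be a cudaStreamWaitEvent in the opposite direction, ordering the copy after the compute stream's prior allocations.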
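
Relatedly, the literal path of CopyCPUTensorToDevice now completes asynchronously: done is fired from a host callback enqueued behind the transfer (ThenDoHostCallback) instead of after BlockHostUntilDone(), while the raw-memcpy path keeps its blocking TODO(hpucha). A minimal CUDA analogue of that completion pattern, assuming CUDA 10+ for cudaLaunchHostFunc (buffer and tag are hypothetical; error handling elided for brevity):

#include <cstdio>
#include <cuda_runtime.h>

// Runs on a CUDA-managed thread once all prior work on the stream has
// finished; CUDA API calls are not allowed inside the callback.
static void CUDART_CB NotifyDone(void* user_data) {
  std::printf("transfer complete: %s\n", static_cast<const char*>(user_data));
}

int main() {
  cudaStream_t h2d_stream;
  cudaStreamCreate(&h2d_stream);

  static float host_buf[64];
  float* dev_buf = nullptr;
  cudaMalloc(&dev_buf, sizeof(host_buf));
  cudaMemcpyAsync(dev_buf, host_buf, sizeof(host_buf),
                  cudaMemcpyHostToDevice, h2d_stream);

  // The analogue of ThenDoHostCallback: the "done" notification is queued
  // behind the copy instead of blocking the host with a full sync.
  static char tag[] = "h2d";
  cudaLaunchHostFunc(h2d_stream, NotifyDone, tag);

  cudaStreamSynchronize(h2d_stream);  // only to keep the demo process alive
  cudaFree(dev_buf);
  cudaStreamDestroy(h2d_stream);
  return 0;
}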