Automated g4 rollback of changelist 202292422

PiperOrigin-RevId: 202551122
author: A. Unique TensorFlower <gardener@tensorflow.org> 2018-06-28 15:06:40 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-06-28 15:10:14 -0700
commit: 0ea6847c892497afdd20c1150fee1e532612ca17 (patch)
tree: 2a5347d4599788cc4ce9c9981a57243a136e1704 /tensorflow
parent: c0616864648cd52749bf722051de1a5d46be9a5e (diff)
9 files changed, 52 insertions, 117 deletions
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 54a41a4daa..7ed609c437 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -40,23 +40,7 @@ namespace tensorflow {
 XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client,
                                          DeviceType device_type)
     : client_(client), device_type_(std::move(device_type)) {}
-XlaCompilationCache::~XlaCompilationCache() {
-  // Ensure any use of our programs have completed by waiting for all stream
-  // executors to complete.
-  for (auto* executor : client_->backend().stream_executors()) {
-    bool ok = executor->SynchronizeAllActivity();
-    if (!ok) {
-      LOG(ERROR) << "Error synchronizing activity while waiting for all "
-                    "programs to complete";
-    }
-  }
-  // TODO(b/110813685): Think about the program ownership model. Programs are
-  // currently owned by the compilation cache which means we must wait for
-  // program completion in the destructor. There are multiple compilation caches
-  // around, which complicates things a little. Perhaps having programs be
-  // shared_ptrs (an invasive change) would make the model easier to reason
-  // about?
-}
+XlaCompilationCache::~XlaCompilationCache() = default;
 
 string XlaCompilationCache::DebugString() {
   return "XLA JIT compilation cache";
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index e20f5aa837..37005479dc 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -67,53 +67,36 @@ Status XlaTransferManager::TransferLiteralToDevice(
   xla::Shape xla_shape;
   TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(),
                                            host_tensor.shape(), &xla_shape));
-  // Create a reference to hold onto host_tensor until after the literal has
-  // been transferred. Also make sure the literal exists until the function
-  // asynchronously completes, as it will be wrapped in an xla::LiteralSlice.
-  TensorReference ref(host_tensor);
-  auto literal = std::make_shared<xla::BorrowingLiteral>(
+  xla::BorrowingLiteral literal(
       static_cast<const char*>(DMAHelper::base(&host_tensor)), xla_shape);
 
   const xla::ShapedBuffer& shaped_buffer =
       XlaTensor::FromTensor(device_tensor)->shaped_buffer();
-  VLOG(1) << "Transfer to device as literal: " << literal->ToString() << " "
+  VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " "
           << shaped_buffer.ToString();
-  TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
-      stream_, *literal, shaped_buffer));
-  // Unref the host tensor, and capture the literal shared_ptr too so it goes
-  // out of scope when the lambda completes.
-  stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
-  return Status::OK();
+  return transfer_manager_->TransferLiteralToDevice(stream_, literal,
+                                                    shaped_buffer);
 }
 
-void XlaTransferManager::TransferLiteralFromDevice(
-    Tensor* host_tensor, const Tensor& device_tensor,
-    const StatusCallback& done) const {
+Status XlaTransferManager::TransferLiteralFromDevice(
+    Tensor* host_tensor, const Tensor& device_tensor) const {
   const xla::ShapedBuffer& shaped_buffer =
       XlaTensor::FromTensor(&device_tensor)->shaped_buffer();
 
-  TensorReference ref(device_tensor);
-  transfer_manager_->TransferLiteralFromDevice(
-      stream_, shaped_buffer,
-      [=, &shaped_buffer](
-          xla::StatusOr<std::unique_ptr<xla::Literal> > literal_or) {
-        ref.Unref();
-        done([&]() -> Status {
-          TF_ASSIGN_OR_RETURN(auto literal, std::move(literal_or));
-          VLOG(1) << "Transfer from device as literal: " << literal->ToString()
-                  << " " << shaped_buffer.ToString();
-          Tensor tensor;
-          TF_RETURN_IF_ERROR(
-              LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor));
-          // Reshape the tensor back to its declared shape.
-          Status status;
-          if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) {
-            status = errors::Internal(
-                "Tensor::CopyFrom failed when copying from XLA device to CPU");
-          }
-          return status;
-        }());
-      });
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<xla::Literal> literal,
+      transfer_manager_->TransferLiteralFromDevice(stream_, shaped_buffer));
+  VLOG(1) << "Transfer from device as literal: " << literal->ToString() << " "
+          << shaped_buffer.ToString();
+  Tensor tensor;
+  TF_RETURN_IF_ERROR(
+      LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor));
+  // Reshape the tensor back to its declared shape.
+  if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) {
+    return errors::Internal(
+        "Tensor::CopyFrom failed when copying from XLA device to CPU");
+  }
+  return Status::OK();
 }
 
 void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
@@ -138,16 +121,17 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
 
     TensorShape shape = shape_representation_fn_(device_tensor->shape(),
                                                  device_tensor->dtype());
-    Status status;
     if (!xla_tensor->has_shaped_buffer()) {
-      status = xla_tensor->AllocateShapedBuffer(
+      Status s = xla_tensor->AllocateShapedBuffer(
           device_tensor->dtype(), shape, client_,
           stream_->parent()->device_ordinal());
-      if (!status.ok()) {
-        return done(status);
+      if (!s.ok()) {
+        done(s);
+        return;
       }
     }
 
+    Status status;
     if (transfer_as_literal_) {
       Tensor reshaped_cpu_tensor;
       if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) {
@@ -200,8 +184,7 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
 
     Status status;
     if (transfer_as_literal_) {
-      TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
-      return;
+      status = TransferLiteralFromDevice(cpu_tensor, *device_tensor);
     } else {
       stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
       // TODO(hpucha): Make this asynchronous.
@@ -211,8 +194,9 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
             "Failed to complete data transfer on stream %p: %s", stream_,
             block_status.error_message().c_str());
       }
-      done(status);
     }
+
+    done(status);
     return;
   }
 
@@ -223,8 +207,8 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
 void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
                                                   Tensor* dst_tensor,
                                                   const StatusCallback& done) {
-  // Perform memory allocation now, and enqueue the device-to-device transfer.
-  Status status = [&]() -> Status {
+  // TODO(phawkins): replace this code with an asynchronous implementation.
+  auto body = [&]() {
     if (src_tensor.NumElements() == 0) {
       return Status::OK();
     }
@@ -239,20 +223,21 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
           xla_dst->AllocateShapedBuffer(src_tensor.dtype(), shape, client_,
                                         stream_->parent()->device_ordinal()));
     }
-    auto from_iter = xla_src->shaped_buffer().buffers().begin();
-    auto to_iter = xla_dst->shaped_buffer().buffers().begin();
-    for (auto end_iter = xla_src->shaped_buffer().buffers().end();
-         from_iter != end_iter; ++from_iter, ++to_iter) {
-      stream_->ThenMemcpyD2D(&to_iter->second, from_iter->second,
-                             to_iter->second.size());
-    }
+    TF_RETURN_IF_ERROR(
+        xla_dst->shaped_buffer().buffers().ForEachMutableElementWithStatus(
+            [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
+              const se::DeviceMemoryBase& from_buffer =
+                  xla_src->shaped_buffer().buffers().element(index);
+              CHECK_EQ(buffer->size(), from_buffer.size());
+              if (!stream_->parent()->SynchronousMemcpy(buffer, from_buffer,
+                                                        buffer->size())) {
+                return errors::Internal("Device to device memcpy failed");
+              }
+              return Status::OK();
+            }));
     return Status::OK();
-  }();
-  if (!status.ok()) {
-    return done(status);
-  } else {
-    stream_->ThenDoHostCallback([=]() { done(Status::OK()); });
-  }
+  };
+  done(body());
 }
 
 XlaDeviceContext::XlaDeviceContext(
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index c5c81d65fe..ee346e5653 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -64,9 +64,8 @@ class XlaTransferManager {
  private:
   Status TransferLiteralToDevice(const Tensor& host_tensor,
                                  Tensor* device_tensor) const;
-  void TransferLiteralFromDevice(Tensor* host_tensor,
-                                 const Tensor& device_tensor,
-                                 const StatusCallback& done) const;
+  Status TransferLiteralFromDevice(Tensor* host_tensor,
+                                   const Tensor& device_tensor) const;
 
   // Stream obtained from a Device, used to transfer tensors between
   // CPU and device.
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index fd75847d0c..7cf2746947 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -82,18 +82,7 @@ StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStreamWrapper(
 
   StatusOr<ScopedShapedBuffer> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
-  if (!return_value.status().ok()) {
-    if (profile != nullptr) {
-      // Ensure the ThenStartTimer call has completed before we destroy timer.
-      // We already have a failure status to return, so just log this if it
-      // fails.
-      Status status = stream->BlockHostUntilDone();
-      if (!status.ok()) {
-        LOG(ERROR) << "Failed to BlockHostUntilDone: " << status;
-      }
-    }
-    return return_value.status();
-  }
+  TF_RETURN_IF_ERROR(return_value.status());
 
   if (profile != nullptr) {
     VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index b2725e2918..4f0569f405 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -180,12 +180,8 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
                       CreateExecutable(std::move(module), run_hlo_passes));
-  TF_ASSIGN_OR_RETURN(
-      ScopedShapedBuffer retval,
-      executable->ExecuteOnStreamWrapper(&service_run_options,
-                                         /*profile=*/profile, arguments));
-  TF_RETURN_IF_ERROR(stream.BlockHostUntilDone());
-  return std::move(retval);
+  return executable->ExecuteOnStreamWrapper(&service_run_options,
+                                            /*profile=*/profile, arguments);
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
@@ -313,7 +309,6 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
 
   std::vector<std::unique_ptr<Literal>> exec_results;
   for (int64 i = 0; i < options.num_replicas; ++i) {
-    TF_RETURN_IF_ERROR(streams[i]->BlockHostUntilDone());
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                         backend().transfer_manager()->TransferLiteralFromDevice(
                             streams[i].get(), results[i]));
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 2c6393794e..fd74cadea2 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -771,10 +771,6 @@ XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
   ScopedShapedBuffer result =
       executable->Run({&x_array}, DefaultExecutableRunOptions())
           .ConsumeValueOrDie();
-  ASSERT_IS_OK(local_client_->mutable_backend()
-                   ->BorrowStream(0)
-                   .ValueOrDie()
-                   ->BlockHostUntilDone());
 
   LiteralTestUtil::ExpectR1Near<float>(
       {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index c31ba0e713..88797a7d0a 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -189,19 +189,7 @@ StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<LocalExecutable> executable,
       local_client_->Compile(computation, argument_layouts, build_options));
-  TF_ASSIGN_OR_RETURN(auto ret, executable->Run(arguments, run_options));
-
-  auto device_ordinal =
-      build_options.device_ordinal() == -1 ? 0 : build_options.device_ordinal();
-  auto* stream = run_options.stream();
-  if (!stream) {
-    stream = local_client_->mutable_backend()
-                 ->BorrowStream(device_ordinal)
-                 .ValueOrDie()
-                 .get();
-  }
-  TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
-  return std::move(ret);
+  return executable->Run(arguments, run_options);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index c0616809f9..28695413f9 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -168,7 +168,6 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
       auto execution_result,
       executable->ExecuteOnStream(&run_options, {&lhs_arg, &rhs_arg},
                                   &hlo_execution_profile));
-  TF_ASSERT_OK(stream_ptr->BlockHostUntilDone());
   (void)execution_result;
 
   *profile_output =
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index c8a6297330..2c4819651a 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -95,7 +95,7 @@ bool HostExecutor::MemcpyDeviceToDevice(Stream *stream,
   // the nature of the HostExecutor) memcpy  on the stream (HostStream)
   // associated with the HostExecutor.
   AsHostStream(stream)->EnqueueTask(
-      [src_mem, dst_mem, size]() { memcpy(dst_mem, src_mem, size); });
+      [src_mem, dst_mem, size]() { memcpy(src_mem, dst_mem, size); });
   return true;
 }
author	A. Unique TensorFlower <gardener@tensorflow.org>	2018-06-28 15:06:40 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-06-28 15:10:14 -0700
commit	0ea6847c892497afdd20c1150fee1e532612ca17 (patch)
tree	2a5347d4599788cc4ce9c9981a57243a136e1704 /tensorflow
parent	c0616864648cd52749bf722051de1a5d46be9a5e (diff)