1 files changed, 162 insertions, 120 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 2accf92503..f34ac256d1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -37,6 +37,18 @@ limitations under the License.
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/util/util.h"
 
+// IMPLEMENTATION NOTE:
+//
+// 1. Within this module, we intentionally LOG(FATAL) if any stream
+//    involved in a memcpy becomes !stream->ok(), because the TF process
+//    today (1/2016) cannot properly recover from such an error.
+//
+// 2. When a 0-size tensor is being copied, we should not schedule a
+//    ThenMemcpy since there are no bytes to move. However, we must
+//    still preserve causal ordering by arranging for the copy-done
+//    callback to happen after all activities previously scheduled on
+//    the given stream have finished.
+
 // If this need to be runtime configurable, consider adding options to
 // ConfigProto.
 const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128;
@@ -50,60 +62,106 @@ namespace tensorflow {
 
 namespace gpu = ::perftools::gputools;
 
+Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
+                   const Tensor* dst,
+                   const DeviceBase::GpuDeviceInfo** dev_info,
+                   gpu::Stream** stream) {
+  if (device == nullptr) {
+    return errors::Internal("Unexpected null device.");
+  }
+  auto di = device->tensorflow_gpu_device_info();
+  if (di == nullptr) {
+    return errors::Internal("Unexpected null device info.");
+  }
+  *dev_info = di;  // Out-param is written before the remaining checks run.
+  if (ctx == nullptr) {
+    return errors::Internal("Unexpected null device context.");
+  }
+  auto gs = static_cast<const GPUDeviceContext*>(ctx)->stream();
+  if (gs == nullptr) {
+    return errors::Internal("No gpu stream is available.");
+  }
+  *stream = gs;  // Out-param is written before the tensor checks run.
+  if (dst != nullptr) {  // Null dst: only src is validated (DMA check below).
+    if (src.dtype() != dst->dtype()) {
+      return errors::Internal("Can't copy a tensor of ",
+                              DataTypeString(src.dtype()), " into a tensor of ",
+                              DataTypeString(dst->dtype()));
+    }
+    if (src.TotalBytes() != dst->TotalBytes()) {
+      return errors::Internal("Can't copy ", src.TotalBytes(),
+                              " bytes of a tensor into another with ",
+                              dst->TotalBytes(), " bytes buffer.");
+    }
+    if ((src.TotalBytes() > 0) && !src.IsInitialized()) {
+      return errors::Internal("Src tensor is not initialized.");
+    }
+    if ((dst->TotalBytes() > 0) && !dst->IsInitialized()) {
+      return errors::Internal("Dst tensor is not initialized.");
+    }
+  }
+  if (!DMAHelper::CanUseDMA(&src)) {
+    return errors::Internal("GPU copy from non-DMA ",
+                            DataTypeString(src.dtype()), " tensor");
+  }
+  return Status::OK();
+}
+
+void* GetBase(const Tensor* src) {
+  return const_cast<void*>(DMAHelper::base(src));
+}
+
+void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }
+
 /*static*/
 void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
                               const DeviceContext* device_context,
                               TensorProto* proto, bool is_dead,
                               StatusCallback done) {
   VLOG(1) << "SetProtoFromGPU device_context " << device_context;
+  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
+  gpu::Stream* stream = nullptr;
+  Status s =
+      PrepareCopy(dev, device_context, tensor, nullptr, &dev_info, &stream);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+
   // Tensor values need to be copied from GPU to CPU ram so that
   // we can build the protobuf response for a RecvTensor RPC.
   // "device context" identifies the stream where the _Send op executed.
-  CHECK(device_context);
-  gpu::Stream* stream =
-      static_cast<const GPUDeviceContext*>(device_context)->stream();
-
-  if (!DMAHelper::CanUseDMA(&tensor)) {
-    done(errors::Internal(strings::StrCat(
-        "GPU copy from non-DMA ", DataTypeString(tensor.dtype()), "tensor")));
-    return;
-  }
   proto->set_dtype(tensor.dtype());
   tensor.shape().AsProto(proto->mutable_tensor_shape());
-  // Prepare a Cord with the right data buf size, and DMA the
-  // data over from the GPU buffer.  Note that 0-size tensors
-  // do not have a backing buffer.
-  const size_t num_bytes = is_dead ? 0 : tensor.TotalBytes();
-  if (num_bytes > 0) {
+
+  // Prepare a proto with the right data buf size, and DMA the data
+  // over from the GPU buffer.  Note that 0-size tensors do not have a
+  // backing buffer.
+  Allocator* alloc = nullptr;
+  char* buf = nullptr;
+  const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes();
+  if (total_bytes > 0) {
     port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU");
-    Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
-    char* mb = alloc->Allocate<char>(num_bytes);
-    const char* src_ptr =
-        reinterpret_cast<const char*>(DMAHelper::base(&tensor));
-    DeviceMemoryBase gpu_src_ptr(const_cast<char*>(src_ptr), num_bytes);
-    stream->ThenMemcpy(mb, gpu_src_ptr, num_bytes);
-    // Use of tensor may outlive stack scope, so keep a ref.
-    TensorReference tensor_ref(tensor);
-    dev->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
-        stream, [stream, done, proto, mb, num_bytes, alloc, tensor_ref]() {
-          if (!stream->ok()) {
-            done(errors::Internal("SetProtoFromGPU: GPU Memcpy failed"));
-            // TODO(pbar) We currently have no way to recover the
-            // worker from a GPU stream in the error state.  Until
-            // there is a way to reset the CUDA driver, it is
-            // preferable to crash the process and restart.  Tracked
-            // under b/23717097
-            LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed";
-            return;
-          }
-          tensor_ref.Unref();
-          port::CopyFromArray(proto->mutable_tensor_content(), mb, num_bytes);
-          alloc->Deallocate<char>(mb, num_bytes);
-          done(Status::OK());
-        });
-  } else {
-    done(Status::OK());
+    alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+    buf = alloc->Allocate<char>(total_bytes);
+    void* src_ptr = GetBase(&tensor);
+    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
+    stream->ThenMemcpy(buf, gpu_src_ptr, total_bytes);
   }
+  // Use of tensor may outlive stack scope, so keep a ref.
+  TensorReference tensor_ref(tensor);
+  dev_info->event_mgr->ThenExecute(stream, [stream, done, proto, buf,
+                                            total_bytes, alloc, tensor_ref]() {
+    if (!stream->ok()) {
+      LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed";
+    }
+    tensor_ref.Unref();
+    if (total_bytes > 0) {
+      port::CopyFromArray(proto->mutable_tensor_content(), buf, total_bytes);
+      alloc->Deallocate<char>(buf, total_bytes);
+    }
+    done(Status::OK());
+  });
 }
 
 // static
@@ -114,67 +172,67 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
                                  AllocatorAttributes dst_alloc_attr,
                                  const Tensor* input, Tensor* output,
                                  StatusCallback done) {
-  const void* src_ptr = DMAHelper::base(input);
-  void* dst_ptr = DMAHelper::base(output);
-  VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
-  const size_t total_bytes = input->TotalBytes();
-
-  gpu::Stream* stream = send_dev_context->stream();
-  if (stream == nullptr) {
-    done(errors::Internal("Failed to find device stream"));
+  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
+  gpu::Stream* stream = nullptr;
+  Status s =
+      PrepareCopy(src, send_dev_context, *input, output, &dev_info, &stream);
+  if (!s.ok()) {
+    done(s);
     return;
   }
-  auto* src_dev_info = src->tensorflow_gpu_device_info();
-  CHECK(src_dev_info);
 
-  DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
-  stream->ThenMemcpy(&gpu_dst_ptr,
-                     DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes},
-                     total_bytes);
-  if (dst->attributes().device_type() == DeviceType(DEVICE_GPU).type()) {
-    // Use of input may outlive stack scope, so keep a ref.
-    TensorReference input_ref(*input);
-    src_dev_info->event_mgr->ThenExecute(stream, [done, stream, input_ref]() {
-      input_ref.Unref();
-      if (!stream->ok()) {
-        done(errors::Internal("GPU->GPU Memcpy failed"));
-      } else {
-        done(Status::OK());
-      }
-    });
+  const int64 total_bytes = input->TotalBytes();
+  if (total_bytes > 0) {
+    void* src_ptr = GetBase(input);
+    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
+    void* dst_ptr = GetBase(output);
+    DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
+    VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
+    stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes);
   }
+
+  // Use of input may outlive stack scope, so keep a ref.
+  TensorReference input_ref(*input);
+  dev_info->event_mgr->ThenExecute(stream, [done, stream, input_ref]() {
+    input_ref.Unref();
+    if (!stream->ok()) {
+      LOG(FATAL) << "GPU->GPU Memcpy failed";
+    }
+    done(Status::OK());
+  });
   send_dev_context->MaintainLifetimeOnStream(input, stream);
 }
 
 static CopyTensor::Registration register_gpu_gpu_copy(
     DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy);
 
+// static
 void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
                                  const DeviceContext* device_context,
                                  const Tensor* gpu_tensor, Tensor* cpu_tensor,
                                  StatusCallback done) {
   VLOG(1) << "CopyGPUTensorToCPU";
-  size_t total_bytes = gpu_tensor->TotalBytes();
-  // Note that 0-size tensors have no backing buffer.
+  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
+  gpu::Stream* stream = nullptr;
+  Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor,
+                         &dev_info, &stream);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+  const int64 total_bytes = gpu_tensor->TotalBytes();
   if (total_bytes > 0) {
-    const void* src_ptr = DMAHelper::base(gpu_tensor);
-    void* dst_ptr = DMAHelper::base(cpu_tensor);
-    CHECK(dst_ptr);
-    auto* stream = gpu_device->tensorflow_gpu_device_info()->stream;
-    if (device_context) {
-      stream = static_cast<const GPUDeviceContext*>(device_context)->stream();
-    }
-    stream->ThenMemcpy(
-        dst_ptr, DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes},
-        total_bytes);
-    stream->BlockHostUntilDone();
+    void* src_ptr = GetBase(gpu_tensor);
+    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
+    void* dst_ptr = GetBase(cpu_tensor);
+    stream->ThenMemcpy(dst_ptr, gpu_src_ptr, total_bytes);
+  }
+  dev_info->event_mgr->ThenExecute(stream, [stream, done]() {
     if (!stream->ok()) {
-      done(errors::Internal("CopyGPUTensorToCPU: GPU->CPU Memcpy failed"));
-      return;
+      LOG(FATAL) << "GPU->CPU Memcpy failed";
     }
-  }
-
-  done(Status::OK());
+    done(Status::OK());
+  });
 }
 
 /*  static */
@@ -183,47 +241,31 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                  Device* gpu_device, Tensor* gpu_tensor,
                                  StatusCallback done) {
   VLOG(1) << "CopyCPUTensorToGPU";
-  CHECK(DeviceType(gpu_device->attributes().device_type()) ==
-        DeviceType(DEVICE_GPU));
-
-  auto* dev_info = gpu_device->tensorflow_gpu_device_info();
-  if (!dev_info) {
-    done(errors::Internal("Failed to find dest device GPUDeviceInfo"));
-    return;
-  }
-  if (cpu_tensor->TotalBytes() != gpu_tensor->TotalBytes()) {
-    done(errors::Internal(
-        strings::StrCat("Can't copy ", cpu_tensor->TotalBytes(),
-                        " bytes of a tensor into another with ",
-                        gpu_tensor->TotalBytes(), " bytes buffer.")));
+  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
+  gpu::Stream* stream = nullptr;
+  Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor,
+                         &dev_info, &stream);
+  if (!s.ok()) {
+    done(s);
     return;
   }
   const int64 total_bytes = cpu_tensor->TotalBytes();
   // Note that 0-size tensors have no backing buffer.
   if (total_bytes > 0) {
-    const void* src_ptr = DMAHelper::base(cpu_tensor);
-    void* dst_ptr = DMAHelper::base(gpu_tensor);
+    void* src_ptr = GetBase(cpu_tensor);
+    void* dst_ptr = GetBase(gpu_tensor);
     DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
-
-    CHECK(device_context);
-    auto* stream =
-        static_cast<const GPUDeviceContext*>(device_context)->stream();
     stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
-    auto* dev_info = gpu_device->tensorflow_gpu_device_info();
-    // Use of cpu_tensor may outlive stack scope, so keep a ref.
-    TensorReference input_ref(*cpu_tensor);
-    dev_info->event_mgr->ThenExecute(stream, [stream, done, input_ref]() {
-      input_ref.Unref();
-      if (!stream->ok()) {
-        done(errors::Internal("CopyCPUTensorToGPU: GPU Memcpy failed"));
-      } else {
-        done(Status::OK());
-      }
-    });
-  } else {
-    // empty tensor case
-    done(Status::OK());
   }
+  // Use of cpu_tensor may outlive stack scope, so keep a ref.
+  TensorReference input_ref(*cpu_tensor);
+  dev_info->event_mgr->ThenExecute(stream, [stream, done, input_ref]() {
+    input_ref.Unref();
+    if (!stream->ok()) {
+      LOG(FATAL) << "CPU->GPU Memcpy failed";
+    }
+    done(Status::OK());
+  });
 }
 
 Status GPUUtil::Sync(Device* gpu_device) {
@@ -257,7 +299,7 @@ string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) {
   CHECK(tensor);
   const int64 num_bytes = std::min<int64>(
       FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes());
-  void* ptr = (num_bytes > 0) ? DMAHelper::base(tensor) : nullptr;
+  void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr;
   strings::Appendf(&ret, "%p:", ptr);
   if (num_bytes > 0) {
     auto* dev_info = device->tensorflow_gpu_device_info();
@@ -295,14 +337,14 @@ uint64 GPUUtil::Checksum(Device* gpu_device,
 }
 
 uint64 GPUUtil::Checksum(const Tensor& tensor) {
-  const float* fptr = reinterpret_cast<const float*>(DMAHelper::base(&tensor));
+  const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor));
   size_t num_bytes = tensor.TotalBytes();
   size_t num_floats = num_bytes / sizeof(float);
   for (size_t i = 0; i < num_floats; ++i) {
     CHECK(!std::isnan(fptr[i])) << " i " << i;
   }
   // TODO(tucker): consider using crc32c instead.
-  return Hash64(reinterpret_cast<const char*>(DMAHelper::base(&tensor)),
+  return Hash64(reinterpret_cast<const char*>(GetBase(&tensor)),
                 tensor.TotalBytes(), 0);
 }