diff options
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_util.cc')
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_util.cc | 282 |
1 files changed, 162 insertions, 120 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc index 2accf92503..f34ac256d1 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_util.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc @@ -37,6 +37,18 @@ limitations under the License. #include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/util/util.h" +// IMPLEMENTATION NOTE: +// +// 1. Within this module, we intentionally LOG(FATAL) if any stream +// involved in memcpy becomes !stream->ok(), because TF process +// today (1/2016) can not properly recover from such an error. +// +// 2. When 0-size tensor is being copied, we should not schedule a +// copy ThenMemcpy since there is no byte to move. However, we must +// ensure the causal ordering by arranging the copy done callback +// happens-after all activities scheduled on the given stream being +// finished. + // If this need to be runtime configurable, consider adding options to // ConfigProto. const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128; @@ -50,60 +62,106 @@ namespace tensorflow { namespace gpu = ::perftools::gputools; +Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src, + const Tensor* dst, + const DeviceBase::GpuDeviceInfo** dev_info, + gpu::Stream** stream) { + if (device == nullptr) { + return errors::Internal("Unexpected null device."); + } + auto di = device->tensorflow_gpu_device_info(); + if (di == nullptr) { + return errors::Internal("Unexpected null device info."); + } + *dev_info = di; + if (ctx == nullptr) { + return errors::Internal("Unexpected null device context."); + } + auto gs = static_cast<const GPUDeviceContext*>(ctx)->stream(); + if (gs == nullptr) { + return errors::Internal("No gpu stream is available."); + } + *stream = gs; + if (dst != nullptr) { + if (src.dtype() != dst->dtype()) { + return errors::Internal("Can't copy a tensor of ", + DataTypeString(src.dtype()), " into a tensor of ", + DataTypeString(dst->dtype())); + } + if (src.TotalBytes() != dst->TotalBytes()) { + return errors::Internal("Can't copy ", src.TotalBytes(), + " bytes of a tensor into another with ", + dst->TotalBytes(), " bytes buffer."); + } + if ((src.TotalBytes() > 0) && !src.IsInitialized()) { + return errors::Internal("Src tensor is not initialized."); + } + if ((dst->TotalBytes() > 0) && !dst->IsInitialized()) { + return errors::Internal("Dst tensor is not initialized."); + } + } + if (!DMAHelper::CanUseDMA(&src)) { + return errors::Internal("GPU copy from non-DMA ", + DataTypeString(src.dtype()), "tensor"); + } + return Status::OK(); +} + +void* GetBase(const Tensor* src) { + return const_cast<void*>(DMAHelper::base(src)); +} + +void* GetBase(Tensor* dst) { return DMAHelper::base(dst); } + /*static*/ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, const DeviceContext* device_context, TensorProto* proto, bool is_dead, StatusCallback done) { VLOG(1) << "SetProtoFromGPU device_context " << device_context; + const DeviceBase::GpuDeviceInfo* dev_info = nullptr; + gpu::Stream* stream = nullptr; + Status s = + PrepareCopy(dev, device_context, tensor, nullptr, &dev_info, &stream); + if (!s.ok()) { + done(s); + return; + } + // Tensor values need to be copied from GPU to CPU ram so that // we can build the protobuf response for a RecvTensor RPC. // "device context" identifies the stream where the _Send op executed. - CHECK(device_context); - gpu::Stream* stream = - static_cast<const GPUDeviceContext*>(device_context)->stream(); - - if (!DMAHelper::CanUseDMA(&tensor)) { - done(errors::Internal(strings::StrCat( - "GPU copy from non-DMA ", DataTypeString(tensor.dtype()), "tensor"))); - return; - } proto->set_dtype(tensor.dtype()); tensor.shape().AsProto(proto->mutable_tensor_shape()); - // Prepare a Cord with the right data buf size, and DMA the - // data over from the GPU buffer. Note that 0-size tensors - // do not have a backing buffer. - const size_t num_bytes = is_dead ? 0 : tensor.TotalBytes(); - if (num_bytes > 0) { + + // Prepare a proto with the right data buf size, and DMA the data + // over from the GPU buffer. Note that 0-size tensors do not have a + // backing buffer. + Allocator* alloc = nullptr; + char* buf = nullptr; + const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes(); + if (total_bytes > 0) { port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU"); - Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0); - char* mb = alloc->Allocate<char>(num_bytes); - const char* src_ptr = - reinterpret_cast<const char*>(DMAHelper::base(&tensor)); - DeviceMemoryBase gpu_src_ptr(const_cast<char*>(src_ptr), num_bytes); - stream->ThenMemcpy(mb, gpu_src_ptr, num_bytes); - // Use of tensor may outlive stack scope, so keep a ref. - TensorReference tensor_ref(tensor); - dev->tensorflow_gpu_device_info()->event_mgr->ThenExecute( - stream, [stream, done, proto, mb, num_bytes, alloc, tensor_ref]() { - if (!stream->ok()) { - done(errors::Internal("SetProtoFromGPU: GPU Memcpy failed")); - // TODO(pbar) We currently have no way to recover the - // worker from a GPU stream in the error state. Until - // there is a way to reset the CUDA driver, it is - // preferable to crash the process and restart. Tracked - // under b/23717097 - LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed"; - return; - } - tensor_ref.Unref(); - port::CopyFromArray(proto->mutable_tensor_content(), mb, num_bytes); - alloc->Deallocate<char>(mb, num_bytes); - done(Status::OK()); - }); - } else { - done(Status::OK()); + alloc = ProcessState::singleton()->GetCUDAHostAllocator(0); + buf = alloc->Allocate<char>(total_bytes); + void* src_ptr = GetBase(&tensor); + DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); + stream->ThenMemcpy(buf, gpu_src_ptr, total_bytes); } + // Use of tensor may outlive stack scope, so keep a ref. + TensorReference tensor_ref(tensor); + dev_info->event_mgr->ThenExecute(stream, [stream, done, proto, buf, + total_bytes, alloc, tensor_ref]() { + if (!stream->ok()) { + LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed"; + } + tensor_ref.Unref(); + if (total_bytes > 0) { + port::CopyFromArray(proto->mutable_tensor_content(), buf, total_bytes); + alloc->Deallocate<char>(buf, total_bytes); + } + done(Status::OK()); + }); } // static @@ -114,67 +172,67 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context, AllocatorAttributes dst_alloc_attr, const Tensor* input, Tensor* output, StatusCallback done) { - const void* src_ptr = DMAHelper::base(input); - void* dst_ptr = DMAHelper::base(output); - VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr; - const size_t total_bytes = input->TotalBytes(); - - gpu::Stream* stream = send_dev_context->stream(); - if (stream == nullptr) { - done(errors::Internal("Failed to find device stream")); + const DeviceBase::GpuDeviceInfo* dev_info = nullptr; + gpu::Stream* stream = nullptr; + Status s = + PrepareCopy(src, send_dev_context, *input, output, &dev_info, &stream); + if (!s.ok()) { + done(s); return; } - auto* src_dev_info = src->tensorflow_gpu_device_info(); - CHECK(src_dev_info); - DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); - stream->ThenMemcpy(&gpu_dst_ptr, - DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes}, - total_bytes); - if (dst->attributes().device_type() == DeviceType(DEVICE_GPU).type()) { - // Use of input may outlive stack scope, so keep a ref. - TensorReference input_ref(*input); - src_dev_info->event_mgr->ThenExecute(stream, [done, stream, input_ref]() { - input_ref.Unref(); - if (!stream->ok()) { - done(errors::Internal("GPU->GPU Memcpy failed")); - } else { - done(Status::OK()); - } - }); + const int64 total_bytes = input->TotalBytes(); + if (total_bytes > 0) { + void* src_ptr = GetBase(input); + DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); + void* dst_ptr = GetBase(output); + DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); + VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr; + stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes); } + + // Use of input may outlive stack scope, so keep a ref. + TensorReference input_ref(*input); + dev_info->event_mgr->ThenExecute(stream, [done, stream, input_ref]() { + input_ref.Unref(); + if (!stream->ok()) { + LOG(FATAL) << "GPU->GPU Memcpy failed"; + } + done(Status::OK()); + }); send_dev_context->MaintainLifetimeOnStream(input, stream); } static CopyTensor::Registration register_gpu_gpu_copy( DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy); +// static void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device, const DeviceContext* device_context, const Tensor* gpu_tensor, Tensor* cpu_tensor, StatusCallback done) { VLOG(1) << "CopyGPUTensorToCPU"; - size_t total_bytes = gpu_tensor->TotalBytes(); - // Note that 0-size tensors have no backing buffer. + const DeviceBase::GpuDeviceInfo* dev_info = nullptr; + gpu::Stream* stream = nullptr; + Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor, + &dev_info, &stream); + if (!s.ok()) { + done(s); + return; + } + const int64 total_bytes = gpu_tensor->TotalBytes(); if (total_bytes > 0) { - const void* src_ptr = DMAHelper::base(gpu_tensor); - void* dst_ptr = DMAHelper::base(cpu_tensor); - CHECK(dst_ptr); - auto* stream = gpu_device->tensorflow_gpu_device_info()->stream; - if (device_context) { - stream = static_cast<const GPUDeviceContext*>(device_context)->stream(); - } - stream->ThenMemcpy( - dst_ptr, DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes}, - total_bytes); - stream->BlockHostUntilDone(); + void* src_ptr = GetBase(gpu_tensor); + DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); + void* dst_ptr = GetBase(cpu_tensor); + stream->ThenMemcpy(dst_ptr, gpu_src_ptr, total_bytes); + } + dev_info->event_mgr->ThenExecute(stream, [stream, done]() { if (!stream->ok()) { - done(errors::Internal("CopyGPUTensorToCPU: GPU->CPU Memcpy failed")); - return; + LOG(FATAL) << "GPU->CPU Memcpy failed"; } - } - - done(Status::OK()); + done(Status::OK()); + }); } /* static */ @@ -183,47 +241,31 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor, Device* gpu_device, Tensor* gpu_tensor, StatusCallback done) { VLOG(1) << "CopyCPUTensorToGPU"; - CHECK(DeviceType(gpu_device->attributes().device_type()) == - DeviceType(DEVICE_GPU)); - - auto* dev_info = gpu_device->tensorflow_gpu_device_info(); - if (!dev_info) { - done(errors::Internal("Failed to find dest device GPUDeviceInfo")); - return; - } - if (cpu_tensor->TotalBytes() != gpu_tensor->TotalBytes()) { - done(errors::Internal( - strings::StrCat("Can't copy ", cpu_tensor->TotalBytes(), - " bytes of a tensor into another with ", - gpu_tensor->TotalBytes(), " bytes buffer."))); + const DeviceBase::GpuDeviceInfo* dev_info = nullptr; + gpu::Stream* stream = nullptr; + Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor, + &dev_info, &stream); + if (!s.ok()) { + done(s); return; } const int64 total_bytes = cpu_tensor->TotalBytes(); // Note that 0-size tensors have no backing buffer. if (total_bytes > 0) { - const void* src_ptr = DMAHelper::base(cpu_tensor); - void* dst_ptr = DMAHelper::base(gpu_tensor); + void* src_ptr = GetBase(cpu_tensor); + void* dst_ptr = GetBase(gpu_tensor); DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); - - CHECK(device_context); - auto* stream = - static_cast<const GPUDeviceContext*>(device_context)->stream(); stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes); - auto* dev_info = gpu_device->tensorflow_gpu_device_info(); - // Use of cpu_tensor may outlive stack scope, so keep a ref. - TensorReference input_ref(*cpu_tensor); - dev_info->event_mgr->ThenExecute(stream, [stream, done, input_ref]() { - input_ref.Unref(); - if (!stream->ok()) { - done(errors::Internal("CopyCPUTensorToGPU: GPU Memcpy failed")); - } else { - done(Status::OK()); - } - }); - } else { - // empty tensor case - done(Status::OK()); } + // Use of cpu_tensor may outlive stack scope, so keep a ref. + TensorReference input_ref(*cpu_tensor); + dev_info->event_mgr->ThenExecute(stream, [stream, done, input_ref]() { + input_ref.Unref(); + if (!stream->ok()) { + LOG(FATAL) << "CPU->GPU Memcpy failed"; + } + done(Status::OK()); + }); } Status GPUUtil::Sync(Device* gpu_device) { @@ -257,7 +299,7 @@ string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) { CHECK(tensor); const int64 num_bytes = std::min<int64>( FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes()); - void* ptr = (num_bytes > 0) ? DMAHelper::base(tensor) : nullptr; + void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr; strings::Appendf(&ret, "%p:", ptr); if (num_bytes > 0) { auto* dev_info = device->tensorflow_gpu_device_info(); @@ -295,14 +337,14 @@ uint64 GPUUtil::Checksum(Device* gpu_device, } uint64 GPUUtil::Checksum(const Tensor& tensor) { - const float* fptr = reinterpret_cast<const float*>(DMAHelper::base(&tensor)); + const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor)); size_t num_bytes = tensor.TotalBytes(); size_t num_floats = num_bytes / sizeof(float); for (size_t i = 0; i < num_floats; ++i) { CHECK(!std::isnan(fptr[i])) << " i " << i; } // TODO(tucker): consider using crc32c instead. - return Hash64(reinterpret_cast<const char*>(DMAHelper::base(&tensor)), + return Hash64(reinterpret_cast<const char*>(GetBase(&tensor)), tensor.TotalBytes(), 0); } |