diff options
Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_util.cc')
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_util.cc | 345 |
1 files changed, 345 insertions, 0 deletions
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc new file mode 100644 index 0000000000..a6a3ce01fc --- /dev/null +++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc @@ -0,0 +1,345 @@ +#include "tensorflow/core/common_runtime/gpu/gpu_util.h" + +//#include "base/commandlineflags.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/stl_util.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/tensor_coding.h" +#include "tensorflow/core/platform/tracing.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/common_runtime/gpu/dma_helper.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/common_runtime/gpu/process_state.h" +#include "tensorflow/core/util/util.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/stream_executor.h" + +#include "tensorflow/core/platform/stream_executor_util.h" + +#if defined(PLATFORM_GOOGLE) +DEFINE_int64(brain_gpu_util_debug_string_maxlen, 128, + "When dumping gpu memory, prints up to this many bytes."); + +DECLARE_bool(record_mem_types); +#else +tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128; +bool FLAGS_EXPERIMENTAL_brain_gpu_multi_stream = false; +extern bool FLAGS_record_mem_types; +#endif + +using perftools::gputools::DeviceMemoryBase; +using perftools::gputools::DeviceMemory; +using perftools::gputools::Stream; + +namespace tensorflow { + +namespace gpu = ::perftools::gputools; + +/*static*/ +void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, + const DeviceContext* device_context, + TensorProto* proto, bool is_dead, + StatusCallback done) { + VLOG(1) << "SetProtoFromGPU device_context " << device_context; + // Tensor values need to be copied from GPU to CPU ram so that + // we can build the protobuf response for a RecvTensor RPC. + // "device context" identifies the stream where the _Send op executed. + CHECK(device_context); + gpu::Stream* stream = + static_cast<const GPUDeviceContext*>(device_context)->stream(); + + if (!DMAHelper::CanUseDMA(&tensor)) { + done(errors::Internal(strings::StrCat( + "GPU copy from non-DMA ", DataTypeString(tensor.dtype()), "tensor"))); + return; + } + proto->set_dtype(tensor.dtype()); + tensor.shape().AsProto(proto->mutable_tensor_shape()); + // Prepare a Cord with the right data buf size, and DMA the + // data over from the GPU buffer. Note that 0-size tensors + // do not have a backing buffer. + const size_t num_bytes = is_dead ? 0 : tensor.TotalBytes(); + if (num_bytes > 0) { + port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU"); + Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0); + char* mb = alloc->Allocate<char>(num_bytes); + const char* src_ptr = + reinterpret_cast<const char*>(DMAHelper::base(&tensor)); + DeviceMemoryBase gpu_src_ptr(const_cast<char*>(src_ptr), num_bytes); + stream->ThenMemcpy(mb, gpu_src_ptr, num_bytes); + // Use of tensor may outlive stack scope, so keep a ref. + Tensor* tensor_ref = new Tensor(tensor); + dev->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + stream, [stream, done, proto, mb, num_bytes, alloc, tensor_ref]() { + if (!stream->ok()) { + done(errors::Internal("SetProtoFromGPU: GPU Memcpy failed")); + // TODO(pbar) We currently have no way to recover the + // worker from a GPU stream in the error state. Until + // there is a way to reset the CUDA driver, it is + // preferable to crash the process and restart. Tracked + // under b/23717097 + LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed"; + return; + } + delete tensor_ref; + port::CopyFromArray(proto->mutable_tensor_content(), mb, num_bytes); + alloc->Deallocate<char>(mb); + done(Status::OK()); + }); + } else { + done(Status::OK()); + } +} + +typedef ProcessState::MemDesc PMD; + +/*static*/ +void GPUUtil::CopyViaDMA(const string& edge_name, + DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, AllocatorAttributes src_alloc_attr, + AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + StatusCallback done) { + port::Tracing::ScopedAnnotation annotation(edge_name); + VLOG(1) << "CopyViaDMA " << edge_name; + size_t total_bytes = input->TotalBytes(); + // Note that 0-size tensors have no backing buffer. + if (total_bytes > 0) { + const void* src_ptr = DMAHelper::base(input); + void* dst_ptr = DMAHelper::base(output); + VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr; + if (FLAGS_record_mem_types) { + ProcessState::MemDesc smd = ProcessState::singleton()->PtrType(src_ptr); + ProcessState::MemDesc dmd = ProcessState::singleton()->PtrType(dst_ptr); + VLOG(0) << "Src " << smd.DebugString() << " Dst " << dmd.DebugString(); + if (smd.loc == PMD::CPU && dmd.loc == PMD::GPU && (!smd.gpu_registered)) { + LOG(WARNING) << "CPU -> GPU no reg for " << edge_name; + } + if (dmd.loc == PMD::CPU && smd.loc == PMD::GPU && (!dmd.gpu_registered)) { + LOG(WARNING) << "GPU -> CPU no reg for " << edge_name; + } + } + + auto src_device_type = src->attributes().device_type(); + auto dst_device_type = dst->attributes().device_type(); + + bool non_cpu_src = (!src_alloc_attr.on_host() && + src_device_type != DeviceType(DEVICE_CPU).type()); + bool non_cpu_dst = (!dst_alloc_attr.on_host() && + dst_device_type != DeviceType(DEVICE_CPU).type()); + if (non_cpu_src) { + gpu::Stream* stream = send_dev_context->stream(); + if (stream == nullptr) { + done(errors::Internal("Failed to find device stream")); + return; + } + auto* src_dev_info = src->tensorflow_gpu_device_info(); + CHECK(src_dev_info); + + if (non_cpu_dst) { + // Device to device copy + DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); + stream->ThenMemcpy( + &gpu_dst_ptr, + DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes}, + total_bytes); + if (dst_device_type == DeviceType(DEVICE_GPU).type()) { + // Use of input may outlive stack scope, so keep a ref. + Tensor* input_ref = new Tensor(*input); + src_dev_info->event_mgr->ThenExecute( + stream, [done, stream, input_ref]() { + delete input_ref; + if (!stream->ok()) { + done(errors::Internal("GPU->GPU Memcpy failed")); + } else { + done(Status::OK()); + } + }); + } + send_dev_context->MaintainLifetimeOnStream(input, stream); + } else { + // Device to host copy. + return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src, + output, done); + } + } else if (non_cpu_dst) { + // Host to Device copy. + // Note that this is already an async copy. + recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done); + } else { + memcpy(dst_ptr, src_ptr, total_bytes); + done(Status::OK()); + } + } else { + // buffer is empty + done(Status::OK()); + } +} + +void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device, + const DeviceContext* device_context, + const Tensor* gpu_tensor, Tensor* cpu_tensor, + StatusCallback done) { + VLOG(1) << "CopyGPUTensorToCPU"; + size_t total_bytes = gpu_tensor->TotalBytes(); + // Note that 0-size tensors have no backing buffer. + if (total_bytes > 0) { + const void* src_ptr = DMAHelper::base(gpu_tensor); + void* dst_ptr = DMAHelper::base(cpu_tensor); + CHECK(dst_ptr); + auto* stream = gpu_device->tensorflow_gpu_device_info()->stream; + if (device_context) { + stream = static_cast<const GPUDeviceContext*>(device_context)->stream(); + } + stream->ThenMemcpy( + dst_ptr, DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes}, + total_bytes); + stream->BlockHostUntilDone(); + if (!stream->ok()) { + done(errors::Internal("CopyGPUTensorToCPU: GPU->CPU Memcpy failed")); + return; + } + } + + done(Status::OK()); +} + +/* static */ +void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor, + const DeviceContext* device_context, + Device* gpu_device, Tensor* gpu_tensor, + StatusCallback done) { + VLOG(1) << "CopyCPUTensorToGPU"; + CHECK(DeviceType(gpu_device->attributes().device_type()) == + DeviceType(DEVICE_GPU)); + + auto* dev_info = gpu_device->tensorflow_gpu_device_info(); + if (!dev_info) { + done(errors::Internal("Failed to find dest device GPUDeviceInfo")); + return; + } + if (cpu_tensor->TotalBytes() != gpu_tensor->TotalBytes()) { + done(errors::Internal( + strings::StrCat("Can't copy ", cpu_tensor->TotalBytes(), + " bytes of a tensor into another with ", + gpu_tensor->TotalBytes(), " bytes buffer."))); + return; + } + const int64 total_bytes = cpu_tensor->TotalBytes(); + // Note that 0-size tensors have no backing buffer. + if (total_bytes > 0) { + const void* src_ptr = DMAHelper::base(cpu_tensor); + void* dst_ptr = DMAHelper::base(gpu_tensor); + DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); + + CHECK(device_context); + auto* stream = + static_cast<const GPUDeviceContext*>(device_context)->stream(); + stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes); + auto* dev_info = gpu_device->tensorflow_gpu_device_info(); + // Use of cpu_tensor may outlive stack scope, so keep a ref. + Tensor* input_ref = new Tensor(*cpu_tensor); + dev_info->event_mgr->ThenExecute(stream, [stream, done, input_ref]() { + delete input_ref; + if (!stream->ok()) { + done(errors::Internal("CopyCPUTensorToGPU: GPU Memcpy failed")); + } else { + done(Status::OK()); + } + }); + } else { + // empty tensor case + done(Status::OK()); + } +} + +Status GPUUtil::Sync(Device* gpu_device) { + VLOG(1) << "GPUUtil::Sync"; + auto* dev_info = gpu_device->tensorflow_gpu_device_info(); + if (!dev_info) { + return errors::Internal("Failed to find dest device GPUDeviceInfo"); + } + dev_info->stream->BlockHostUntilDone(); + if (!dev_info->stream->ok()) { + LOG(FATAL) << "GPU sync failed"; + } + return Status::OK(); +} + +Status GPUUtil::SyncAll(Device* gpu_device) { + VLOG(1) << "GPUUtil::SyncAll"; + auto* dev_info = gpu_device->tensorflow_gpu_device_info(); + if (!dev_info) { + return errors::Internal("Failed to find dest device GPUDeviceInfo"); + } + if (!dev_info->stream->parent()->SynchronizeAllActivity() || + !dev_info->stream->ok()) { + LOG(FATAL) << "GPU sync failed"; + } + return Status::OK(); +} + +string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) { + string ret; + CHECK(tensor); + const int64 num_bytes = std::min<int64>( + FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes()); + void* ptr = (num_bytes > 0) ? DMAHelper::base(tensor) : nullptr; + strings::Appendf(&ret, "%p:", ptr); + if (num_bytes > 0) { + auto* dev_info = device->tensorflow_gpu_device_info(); + if (!dev_info) { + strings::StrAppend( + &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes)); + } else { + string buf; + buf.resize(num_bytes); + DeviceMemoryBase gpu_ptr(ptr, num_bytes); + Status s = dev_info->stream->parent()->SynchronousMemcpyD2H( + gpu_ptr, num_bytes, gtl::string_as_array(&buf)); + strings::StrAppend(&ret, + PrintMemory(gtl::string_as_array(&buf), num_bytes)); + } + } + return ret; +} + +// TODO(pbar) Checksum is called from places without a valid device context. +uint64 GPUUtil::Checksum(Device* gpu_device, + const DeviceContext* device_context, + const Tensor& tensor) { + Tensor copy(tensor.dtype(), tensor.shape()); + Status s; + Notification n; + CopyGPUTensorToCPU(gpu_device, device_context, &tensor, ©, + [&s, &n](Status status) { + s.Update(status); + n.Notify(); + }); + n.WaitForNotification(); + CHECK(s.ok()) << s; + return Checksum(copy); +} + +uint64 GPUUtil::Checksum(const Tensor& tensor) { + const float* fptr = reinterpret_cast<const float*>(DMAHelper::base(&tensor)); + size_t num_bytes = tensor.TotalBytes(); + size_t num_floats = num_bytes / sizeof(float); + for (size_t i = 0; i < num_floats; ++i) { + CHECK(!std::isnan(fptr[i])) << " i " << i; + } + // TODO(tucker): consider using crc32c instead. + return Hash64(reinterpret_cast<const char*>(DMAHelper::base(&tensor)), + tensor.TotalBytes(), 0); +} + +} // namespace tensorflow |