Diffstat (limited to 'tensorflow/core/common_runtime/gpu/gpu_util.cc')
-rw-r--r--  tensorflow/core/common_runtime/gpu/gpu_util.cc  282
1 file changed, 162 insertions(+), 120 deletions(-)
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 2accf92503..f34ac256d1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -37,6 +37,18 @@ limitations under the License.
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/util/util.h"
+// IMPLEMENTATION NOTE:
+//
+// 1. Within this module, we intentionally LOG(FATAL) if any stream
+// involved in a memcpy becomes !stream->ok(), because the TF process
+// today (1/2016) cannot properly recover from such an error.
+//
+// 2. When a 0-size tensor is being copied, we should not schedule a
+// ThenMemcpy since there are no bytes to move. However, we must
+// still ensure causal ordering by arranging for the copy-done
+// callback to run after all work previously scheduled on the
+// given stream has finished.
+
// If this needs to be runtime configurable, consider adding options to
// ConfigProto.
const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128;
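
The ordering requirement in point 2 of the note can be illustrated standalone: even when there are no bytes to move, the completion callback must still be enqueued behind everything already scheduled on the stream. A minimal sketch in plain C++ follows, with a toy FIFO queue standing in for a CUDA stream (ToyStream, CopyWithCallback, and all other names here are invented for illustration):

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <queue>

    // Toy in-order work queue standing in for a GPU stream: items run
    // strictly in FIFO order, like work submitted to a CUDA stream.
    class ToyStream {
     public:
      void Enqueue(std::function<void()> fn) { work_.push(std::move(fn)); }
      void DrainAll() {
        while (!work_.empty()) {
          work_.front()();
          work_.pop();
        }
      }

     private:
      std::queue<std::function<void()>> work_;
    };

    void CopyWithCallback(ToyStream* stream, std::size_t num_bytes,
                          std::function<void()> done) {
      if (num_bytes > 0) {
        stream->Enqueue(
            [num_bytes] { std::cout << "memcpy " << num_bytes << " bytes\n"; });
      }
      // Even for a zero-byte copy, `done` is enqueued on the stream so it
      // runs only after everything scheduled earlier has finished -- the
      // causal-ordering requirement from point 2 of the note above.
      stream->Enqueue(std::move(done));
    }

    int main() {
      ToyStream s;
      s.Enqueue([] { std::cout << "earlier kernel\n"; });
      CopyWithCallback(&s, 0, [] { std::cout << "copy done\n"; });
      s.DrainAll();  // prints "earlier kernel" then "copy done"
    }
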
@@ -50,60 +62,106 @@ namespace tensorflow {
namespace gpu = ::perftools::gputools;
+Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
+ const Tensor* dst,
+ const DeviceBase::GpuDeviceInfo** dev_info,
+ gpu::Stream** stream) {
+ if (device == nullptr) {
+ return errors::Internal("Unexpected null device.");
+ }
+ auto di = device->tensorflow_gpu_device_info();
+ if (di == nullptr) {
+ return errors::Internal("Unexpected null device info.");
+ }
+ *dev_info = di;
+ if (ctx == nullptr) {
+ return errors::Internal("Unexpected null device context.");
+ }
+ auto gs = static_cast<const GPUDeviceContext*>(ctx)->stream();
+ if (gs == nullptr) {
+ return errors::Internal("No gpu stream is available.");
+ }
+ *stream = gs;
+ if (dst != nullptr) {
+ if (src.dtype() != dst->dtype()) {
+ return errors::Internal("Can't copy a tensor of ",
+ DataTypeString(src.dtype()), " into a tensor of ",
+ DataTypeString(dst->dtype()));
+ }
+ if (src.TotalBytes() != dst->TotalBytes()) {
+ return errors::Internal("Can't copy ", src.TotalBytes(),
+ " bytes of a tensor into another with ",
+ dst->TotalBytes(), " bytes buffer.");
+ }
+ if ((src.TotalBytes() > 0) && !src.IsInitialized()) {
+ return errors::Internal("Src tensor is not initialized.");
+ }
+ if ((dst->TotalBytes() > 0) && !dst->IsInitialized()) {
+ return errors::Internal("Dst tensor is not initialized.");
+ }
+ }
+ if (!DMAHelper::CanUseDMA(&src)) {
+ return errors::Internal("GPU copy from non-DMA ",
+ DataTypeString(src.dtype()), " tensor");
+ }
+ return Status::OK();
+}
+
+void* GetBase(const Tensor* src) {
+ return const_cast<void*>(DMAHelper::base(src));
+}
+
+void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }
+
/*static*/
void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
const DeviceContext* device_context,
TensorProto* proto, bool is_dead,
StatusCallback done) {
VLOG(1) << "SetProtoFromGPU device_context " << device_context;
+ const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
+ gpu::Stream* stream = nullptr;
+ Status s =
+ PrepareCopy(dev, device_context, tensor, nullptr, &dev_info, &stream);
+ if (!s.ok()) {
+ done(s);
+ return;
+ }
+
// Tensor values need to be copied from GPU to CPU ram so that
// we can build the protobuf response for a RecvTensor RPC.
// "device context" identifies the stream where the _Send op executed.
- CHECK(device_context);
- gpu::Stream* stream =
- static_cast<const GPUDeviceContext*>(device_context)->stream();
-
- if (!DMAHelper::CanUseDMA(&tensor)) {
- done(errors::Internal(strings::StrCat(
- "GPU copy from non-DMA ", DataTypeString(tensor.dtype()), "tensor")));
- return;
- }
proto->set_dtype(tensor.dtype());
tensor.shape().AsProto(proto->mutable_tensor_shape());
- // Prepare a Cord with the right data buf size, and DMA the
- // data over from the GPU buffer. Note that 0-size tensors
- // do not have a backing buffer.
- const size_t num_bytes = is_dead ? 0 : tensor.TotalBytes();
- if (num_bytes > 0) {
+
+ // Prepare a proto with the right data buf size, and DMA the data
+ // over from the GPU buffer. Note that 0-size tensors do not have a
+ // backing buffer.
+ Allocator* alloc = nullptr;
+ char* buf = nullptr;
+ const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes();
+ if (total_bytes > 0) {
port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU");
- Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
- char* mb = alloc->Allocate<char>(num_bytes);
- const char* src_ptr =
- reinterpret_cast<const char*>(DMAHelper::base(&tensor));
- DeviceMemoryBase gpu_src_ptr(const_cast<char*>(src_ptr), num_bytes);
- stream->ThenMemcpy(mb, gpu_src_ptr, num_bytes);
- // Use of tensor may outlive stack scope, so keep a ref.
- TensorReference tensor_ref(tensor);
- dev->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
- stream, [stream, done, proto, mb, num_bytes, alloc, tensor_ref]() {
- if (!stream->ok()) {
- done(errors::Internal("SetProtoFromGPU: GPU Memcpy failed"));
- // TODO(pbar) We currently have no way to recover the
- // worker from a GPU stream in the error state. Until
- // there is a way to reset the CUDA driver, it is
- // preferable to crash the process and restart. Tracked
- // under b/23717097
- LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed";
- return;
- }
- tensor_ref.Unref();
- port::CopyFromArray(proto->mutable_tensor_content(), mb, num_bytes);
- alloc->Deallocate<char>(mb, num_bytes);
- done(Status::OK());
- });
- } else {
- done(Status::OK());
+ alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
+ buf = alloc->Allocate<char>(total_bytes);
+ void* src_ptr = GetBase(&tensor);
+ DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
+ stream->ThenMemcpy(buf, gpu_src_ptr, total_bytes);
}
+ // Use of tensor may outlive stack scope, so keep a ref.
+ TensorReference tensor_ref(tensor);
+ dev_info->event_mgr->ThenExecute(stream, [stream, done, proto, buf,
+ total_bytes, alloc, tensor_ref]() {
+ if (!stream->ok()) {
+ LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed";
+ }
+ tensor_ref.Unref();
+ if (total_bytes > 0) {
+ port::CopyFromArray(proto->mutable_tensor_content(), buf, total_bytes);
+ alloc->Deallocate<char>(buf, total_bytes);
+ }
+ done(Status::OK());
+ });
}
// static
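
PrepareCopy consolidates validation that the old code scattered across CHECKs and ad-hoc error returns: every precondition is tested up front and reported as a Status before any work is scheduled. Below is a minimal standalone sketch of the same early-return pattern; the Status and Buffer types are invented stand-ins, not the TF classes:

    #include <cstddef>
    #include <iostream>
    #include <string>

    // Minimal stand-ins for tensorflow::Status and a tensor buffer.
    struct Status {
      std::string msg;  // empty means OK
      bool ok() const { return msg.empty(); }
      static Status OK() { return {}; }
      static Status Internal(std::string m) { return {std::move(m)}; }
    };

    struct Buffer {
      std::size_t bytes;
      bool initialized;
    };

    // Same shape as PrepareCopy above: check every precondition up front
    // and return a Status, instead of CHECK-crashing partway into a copy.
    Status PrepareCopy(const Buffer& src, const Buffer* dst) {
      if (dst != nullptr) {
        if (src.bytes != dst->bytes) {
          return Status::Internal("Can't copy " + std::to_string(src.bytes) +
                                  " bytes into a " + std::to_string(dst->bytes) +
                                  " byte buffer.");
        }
        if (dst->bytes > 0 && !dst->initialized) {
          return Status::Internal("Dst buffer is not initialized.");
        }
      }
      if (src.bytes > 0 && !src.initialized) {
        return Status::Internal("Src buffer is not initialized.");
      }
      return Status::OK();
    }

    int main() {
      Buffer src{16, true}, dst{32, true};
      Status s = PrepareCopy(src, &dst);
      if (!s.ok()) std::cout << s.msg << "\n";  // size-mismatch message
    }
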
@@ -114,67 +172,67 @@ void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
AllocatorAttributes dst_alloc_attr,
const Tensor* input, Tensor* output,
StatusCallback done) {
- const void* src_ptr = DMAHelper::base(input);
- void* dst_ptr = DMAHelper::base(output);
- VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
- const size_t total_bytes = input->TotalBytes();
-
- gpu::Stream* stream = send_dev_context->stream();
- if (stream == nullptr) {
- done(errors::Internal("Failed to find device stream"));
+ const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
+ gpu::Stream* stream = nullptr;
+ Status s =
+ PrepareCopy(src, send_dev_context, *input, output, &dev_info, &stream);
+ if (!s.ok()) {
+ done(s);
return;
}
- auto* src_dev_info = src->tensorflow_gpu_device_info();
- CHECK(src_dev_info);
- DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
- stream->ThenMemcpy(&gpu_dst_ptr,
- DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes},
- total_bytes);
- if (dst->attributes().device_type() == DeviceType(DEVICE_GPU).type()) {
- // Use of input may outlive stack scope, so keep a ref.
- TensorReference input_ref(*input);
- src_dev_info->event_mgr->ThenExecute(stream, [done, stream, input_ref]() {
- input_ref.Unref();
- if (!stream->ok()) {
- done(errors::Internal("GPU->GPU Memcpy failed"));
- } else {
- done(Status::OK());
- }
- });
+ const int64 total_bytes = input->TotalBytes();
+ if (total_bytes > 0) {
+ void* src_ptr = GetBase(input);
+ DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
+ void* dst_ptr = GetBase(output);
+ DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
+ VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
+ stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes);
}
+
+ // Use of input may outlive stack scope, so keep a ref.
+ TensorReference input_ref(*input);
+ dev_info->event_mgr->ThenExecute(stream, [done, stream, input_ref]() {
+ input_ref.Unref();
+ if (!stream->ok()) {
+ LOG(FATAL) << "GPU->GPU Memcpy failed";
+ }
+ done(Status::OK());
+ });
send_dev_context->MaintainLifetimeOnStream(input, stream);
}
static CopyTensor::Registration register_gpu_gpu_copy(
DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy);
+// static
void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
const DeviceContext* device_context,
const Tensor* gpu_tensor, Tensor* cpu_tensor,
StatusCallback done) {
VLOG(1) << "CopyGPUTensorToCPU";
- size_t total_bytes = gpu_tensor->TotalBytes();
- // Note that 0-size tensors have no backing buffer.
+ const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
+ gpu::Stream* stream = nullptr;
+ Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor,
+ &dev_info, &stream);
+ if (!s.ok()) {
+ done(s);
+ return;
+ }
+ const int64 total_bytes = gpu_tensor->TotalBytes();
if (total_bytes > 0) {
- const void* src_ptr = DMAHelper::base(gpu_tensor);
- void* dst_ptr = DMAHelper::base(cpu_tensor);
- CHECK(dst_ptr);
- auto* stream = gpu_device->tensorflow_gpu_device_info()->stream;
- if (device_context) {
- stream = static_cast<const GPUDeviceContext*>(device_context)->stream();
- }
- stream->ThenMemcpy(
- dst_ptr, DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes},
- total_bytes);
- stream->BlockHostUntilDone();
+ void* src_ptr = GetBase(gpu_tensor);
+ DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
+ void* dst_ptr = GetBase(cpu_tensor);
+ stream->ThenMemcpy(dst_ptr, gpu_src_ptr, total_bytes);
+ }
+ dev_info->event_mgr->ThenExecute(stream, [stream, done]() {
if (!stream->ok()) {
- done(errors::Internal("CopyGPUTensorToCPU: GPU->CPU Memcpy failed"));
- return;
+ LOG(FATAL) << "GPU->CPU Memcpy failed";
}
- }
-
- done(Status::OK());
+ done(Status::OK());
+ });
}
/* static */
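
Both routines above capture a TensorReference in the completion callback so the tensor's buffer outlives the caller's stack frame until the asynchronous copy finishes. Here is a minimal sketch of that keep-alive pattern, using std::shared_ptr in place of TF's manual refcounting; AsyncCopy and the pending queue are invented stand-ins for EventMgr::ThenExecute:

    #include <functional>
    #include <iostream>
    #include <memory>
    #include <vector>

    // Deferred-callback list standing in for EventMgr::ThenExecute.
    std::vector<std::function<void()>> pending;

    struct TensorData {
      std::vector<float> values;
    };

    void AsyncCopy(std::shared_ptr<TensorData> input,
                   std::function<void()> done) {
      // Capturing the shared_ptr plays the role of TensorReference above:
      // the buffer stays alive until the copy-done callback has run, even
      // though the caller's stack frame is long gone.
      pending.push_back([input, done] {
        std::cout << "copied " << input->values.size() << " floats\n";
        done();
      });
    }

    int main() {
      {
        auto t = std::make_shared<TensorData>(TensorData{{1.f, 2.f, 3.f}});
        AsyncCopy(t, [] { std::cout << "done\n"; });
      }  // the caller's handle is gone; the callback still owns the data
      for (auto& fn : pending) fn();
    }
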
@@ -183,47 +241,31 @@ void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
Device* gpu_device, Tensor* gpu_tensor,
StatusCallback done) {
VLOG(1) << "CopyCPUTensorToGPU";
- CHECK(DeviceType(gpu_device->attributes().device_type()) ==
- DeviceType(DEVICE_GPU));
-
- auto* dev_info = gpu_device->tensorflow_gpu_device_info();
- if (!dev_info) {
- done(errors::Internal("Failed to find dest device GPUDeviceInfo"));
- return;
- }
- if (cpu_tensor->TotalBytes() != gpu_tensor->TotalBytes()) {
- done(errors::Internal(
- strings::StrCat("Can't copy ", cpu_tensor->TotalBytes(),
- " bytes of a tensor into another with ",
- gpu_tensor->TotalBytes(), " bytes buffer.")));
+ const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
+ gpu::Stream* stream = nullptr;
+ Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor,
+ &dev_info, &stream);
+ if (!s.ok()) {
+ done(s);
return;
}
const int64 total_bytes = cpu_tensor->TotalBytes();
// Note that 0-size tensors have no backing buffer.
if (total_bytes > 0) {
- const void* src_ptr = DMAHelper::base(cpu_tensor);
- void* dst_ptr = DMAHelper::base(gpu_tensor);
+ void* src_ptr = GetBase(cpu_tensor);
+ void* dst_ptr = GetBase(gpu_tensor);
DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
-
- CHECK(device_context);
- auto* stream =
- static_cast<const GPUDeviceContext*>(device_context)->stream();
stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
- auto* dev_info = gpu_device->tensorflow_gpu_device_info();
- // Use of cpu_tensor may outlive stack scope, so keep a ref.
- TensorReference input_ref(*cpu_tensor);
- dev_info->event_mgr->ThenExecute(stream, [stream, done, input_ref]() {
- input_ref.Unref();
- if (!stream->ok()) {
- done(errors::Internal("CopyCPUTensorToGPU: GPU Memcpy failed"));
- } else {
- done(Status::OK());
- }
- });
- } else {
- // empty tensor case
- done(Status::OK());
}
+ // Use of cpu_tensor may outlive stack scope, so keep a ref.
+ TensorReference input_ref(*cpu_tensor);
+ dev_info->event_mgr->ThenExecute(stream, [stream, done, input_ref]() {
+ input_ref.Unref();
+ if (!stream->ok()) {
+ LOG(FATAL) << "CPU->GPU Memcpy failed";
+ }
+ done(Status::OK());
+ });
}
Status GPUUtil::Sync(Device* gpu_device) {
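
With this change, CopyGPUTensorToCPU no longer calls BlockHostUntilDone; completion is delivered through the event-manager callback instead. A caller that still wants synchronous semantics can rebuild them on top of the callback API, roughly as sketched below with a hand-rolled Notification (nothing here is TF API; it only assumes the done-callback style used above):

    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <thread>

    class Notification {
     public:
      void Notify() {
        std::lock_guard<std::mutex> l(mu_);
        done_ = true;
        cv_.notify_all();
      }
      void WaitForNotification() {
        std::unique_lock<std::mutex> l(mu_);
        cv_.wait(l, [this] { return done_; });
      }

     private:
      std::mutex mu_;
      std::condition_variable cv_;
      bool done_ = false;
    };

    // Async API in the style of the copy routines above: returns at once
    // and invokes `done` from another thread when the "copy" finishes.
    std::thread AsyncCopy(std::function<void()> done) {
      return std::thread([done] { done(); });
    }

    int main() {
      Notification n;
      std::thread t = AsyncCopy([&n] { n.Notify(); });
      n.WaitForNotification();  // blocks until the copy-done callback ran
      t.join();
    }
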
@@ -257,7 +299,7 @@ string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) {
CHECK(tensor);
const int64 num_bytes = std::min<int64>(
FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes());
- void* ptr = (num_bytes > 0) ? DMAHelper::base(tensor) : nullptr;
+ void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr;
strings::Appendf(&ret, "%p:", ptr);
if (num_bytes > 0) {
auto* dev_info = device->tensorflow_gpu_device_info();
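
MemoryDebugString prints the buffer address followed by a bounded prefix of the tensor's bytes. The sketch below approximates that output shape in plain C++; the exact formatting is a guess, not the TF implementation:

    #include <cstddef>
    #include <cstdio>
    #include <string>
    #include <vector>

    // "<pointer>:" followed by a hex dump of at most `maxlen` bytes.
    std::string DebugString(const void* ptr, std::size_t num_bytes,
                            std::size_t maxlen) {
      char buf[32];
      std::snprintf(buf, sizeof(buf), "%p:", ptr);
      std::string ret = buf;
      const unsigned char* p = static_cast<const unsigned char*>(ptr);
      for (std::size_t i = 0; i < num_bytes && i < maxlen; ++i) {
        std::snprintf(buf, sizeof(buf), "%02x", p[i]);
        ret += buf;
      }
      return ret;
    }

    int main() {
      std::vector<float> data = {1.0f, 2.0f};
      std::printf("%s\n",
                  DebugString(data.data(), data.size() * sizeof(float), 16)
                      .c_str());
    }
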
@@ -295,14 +337,14 @@ uint64 GPUUtil::Checksum(Device* gpu_device,
}
uint64 GPUUtil::Checksum(const Tensor& tensor) {
- const float* fptr = reinterpret_cast<const float*>(DMAHelper::base(&tensor));
+ const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor));
size_t num_bytes = tensor.TotalBytes();
size_t num_floats = num_bytes / sizeof(float);
for (size_t i = 0; i < num_floats; ++i) {
CHECK(!std::isnan(fptr[i])) << " i " << i;
}
// TODO(tucker): consider using crc32c instead.
- return Hash64(reinterpret_cast<const char*>(DMAHelper::base(&tensor)),
+ return Hash64(reinterpret_cast<const char*>(GetBase(&tensor)),
tensor.TotalBytes(), 0);
}
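
Checksum rejects NaNs and then hashes the tensor's raw bytes. The standalone sketch below uses FNV-1a in place of TF's Hash64 (a different hash serving the same role, so its values will not match Hash64's):

    #include <cmath>
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>
    #include <iostream>
    #include <vector>

    // Verify there are no NaNs, then hash the raw bytes with FNV-1a.
    uint64_t ChecksumFloats(const std::vector<float>& values) {
      uint64_t h = 1469598103934665603ull;  // FNV offset basis
      for (std::size_t i = 0; i < values.size(); ++i) {
        if (std::isnan(values[i])) {  // mirrors the CHECK above
          std::cerr << "NaN at index " << i << "\n";
          std::abort();
        }
        unsigned char bytes[sizeof(float)];
        std::memcpy(bytes, &values[i], sizeof(float));
        for (unsigned char b : bytes) {
          h ^= b;
          h *= 1099511628211ull;  // FNV prime
        }
      }
      return h;
    }

    int main() {
      std::cout << std::hex << ChecksumFloats({1.f, 2.f, 3.f}) << "\n";
    }
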