diff options
author | 2016-01-22 14:45:34 -0800 | |
---|---|---|
committer | 2016-01-22 17:04:38 -0800 | |
commit | 4ba51b33357d68f882a920fb4f87bfe67bb034a0 (patch) | |
tree | 1d9741962b25bde407e90103c506ca9ab1ec2042 /tensorflow | |
parent | fde1dc4a489471bb9064f7a0013b9c89f46febf8 (diff) |
Disentangle the GPU code from the CPU code. This means a few things:
* The "core_cpu_internal" build target no longer includes files from the
common_runtime/gpu/ directory.
* tensorflow/core internal targets instead can get access to those headers via
the "gpu_runtime" target.
* The class "CopyTensor" is introduced. It lives in common_runtime/
but supports registration of copy functions so the "gpu_runtime"
target can add a GPU->GPU copy ability if it is linked in.
This registration should make it easier to add more device types
in the future.
* The "core_cpu" and "core_cpu_internal" build targets no longer
reference GPUUtil::CopyViaDMA; rendezvous_mgr uses CopyTensor
instead.
Also the "copy_tensor" build target was not needed.
Change: 112821119
Diffstat (limited to 'tensorflow')
-rw-r--r-- | tensorflow/core/BUILD | 22 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/copy_tensor.cc | 110 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/copy_tensor.h | 54 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/dma_helper.h (renamed from tensorflow/core/common_runtime/gpu/dma_helper.h) | 12 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_util.cc | 118 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/gpu/gpu_util.h | 24 | ||||
-rw-r--r-- | tensorflow/core/common_runtime/rendezvous_mgr.cc | 41 |
7 files changed, 228 insertions, 153 deletions
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index f0e962a735..95d5531876 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -773,23 +773,13 @@ cc_library( deps = ["//tensorflow/core/platform/default/build_config:protos_cc"], ) -cc_library( - name = "copy_tensor", - deps = [ - ":lib", - ":protos_cc", - ":stream_executor", - "//third_party/eigen3", - ], -) - tf_cuda_library( name = "core_cpu_internal", srcs = glob( [ "client/**/*.cc", - "common_runtime/**/*.h", # TODO(josh11b): exclude common_runtime/gpu/ - "common_runtime/**/*.cc", + "common_runtime/*.h", + "common_runtime/*.cc", "graph/**/*.h", "graph/**/*.cc", "public/session.h", @@ -800,8 +790,6 @@ tf_cuda_library( exclude = [ "**/*test*", "**/*main.cc", - "common_runtime/gpu/*.cc", - "common_runtime/copy_tensor.cc", "common_runtime/gpu_device_factory.cc", "common_runtime/direct_session.cc", "common_runtime/direct_session.h", @@ -809,7 +797,7 @@ tf_cuda_library( ), hdrs = glob( [ - "common_runtime/**/*.h", # TODO(josh11b): exclude common_runtime/gpu/ + "common_runtime/*.h", "graph/**/*.h", ], exclude = [ @@ -819,7 +807,6 @@ tf_cuda_library( ), copts = tf_copts(), deps = [ - ":copy_tensor", ":framework", ":framework_internal", ":lib", @@ -861,7 +848,6 @@ tf_cuda_library( name = "gpu_runtime", srcs = glob( [ - "common_runtime/gpu/*.h", "common_runtime/gpu/*.cc", ], exclude = [ @@ -869,6 +855,7 @@ tf_cuda_library( "**/*test.cc", ], ), + hdrs = glob(["common_runtime/gpu/*.h"]), copts = tf_copts(), cuda_deps = [ ":cuda", @@ -1020,6 +1007,7 @@ tf_cc_tests( ":direct_session", ":framework", ":framework_internal", + ":gpu_runtime", ":kernels", ":lib", ":lib_internal", diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc new file mode 100644 index 0000000000..54fe6a4a08 --- /dev/null +++ b/tensorflow/core/common_runtime/copy_tensor.cc @@ -0,0 +1,110 @@ +#include "tensorflow/core/common_runtime/copy_tensor.h" + +#include <vector> +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/tracing.h" + +namespace tensorflow { +namespace { + +static bool initialization_done = false; + +struct RegistrationInfo { + RegistrationInfo(DeviceType s, DeviceType r, CopyTensor::CopyFunction cf) + : sender_device_type(s), receiver_device_type(r), copy_function(cf) {} + DeviceType sender_device_type; + DeviceType receiver_device_type; + CopyTensor::CopyFunction copy_function; +}; + +// We use a vector instead of a map since we expect there to be very +// few registrations. +std::vector<RegistrationInfo>* MutableRegistry() { + static std::vector<RegistrationInfo>* registry = + new std::vector<RegistrationInfo>; + return registry; +} + +} // namespace + +// static +void CopyTensor::ViaDMA(const string& edge_name, + DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, const AllocatorAttributes src_alloc_attr, + const AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + StatusCallback done) { + initialization_done = true; + port::Tracing::ScopedAnnotation annotation(edge_name); + VLOG(1) << "CopyViaDMA " << edge_name; + const size_t total_bytes = input->TotalBytes(); + + // Note that 0-size tensors have no backing buffer. + if (total_bytes > 0) { + const DeviceType src_device_type(src_alloc_attr.on_host() + ? DEVICE_CPU + : src->attributes().device_type()); + const DeviceType dst_device_type(dst_alloc_attr.on_host() + ? DEVICE_CPU + : dst->attributes().device_type()); + const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU); + const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU); + + if (non_cpu_src) { + if (non_cpu_dst) { + // Device to device copy. Look through registry for an appropriate + // CopyFunction. + std::vector<RegistrationInfo>* registry = MutableRegistry(); + for (const RegistrationInfo& ri : *registry) { + if (ri.sender_device_type == src_device_type && + ri.receiver_device_type == dst_device_type) { + ri.copy_function(send_dev_context, recv_dev_context, src, dst, + src_alloc_attr, dst_alloc_attr, input, output, + done); + return; + } + } + + // TODO(josh11b): If no CopyFunction is found, we currently fail + // but we could copy between devices via CPU. + done(errors::Unimplemented( + "No function registered to copy from devices of type ", + src_device_type.type(), " to devices of type ", + dst_device_type.type())); + } else { + // Device to host copy. + return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src, + output, done); + } + } else if (non_cpu_dst) { + // Host to Device copy. + // Note that this is already an async copy. + recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done); + } else { + *output = *input; + done(Status::OK()); + } + } else { + // buffer is empty + done(Status::OK()); + } +} + +// static +Status CopyTensor::Register(DeviceType sender_device_type, + DeviceType receiver_device_type, + CopyFunction copy_function) { + if (initialization_done) { + return errors::FailedPrecondition( + "May only register CopyTensor functions during before the first tensor " + "is copied."); + } + std::vector<RegistrationInfo>* registry = MutableRegistry(); + registry->emplace_back(sender_device_type, receiver_device_type, + copy_function); + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/copy_tensor.h b/tensorflow/core/common_runtime/copy_tensor.h new file mode 100644 index 0000000000..5bfeadbca1 --- /dev/null +++ b/tensorflow/core/common_runtime/copy_tensor.h @@ -0,0 +1,54 @@ +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_ + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class CopyTensor { + public: + typedef void (*CopyFunction)(DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, + const AllocatorAttributes src_alloc_attr, + const AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + StatusCallback done); + + // Copies "input" to "output" between devices accessible to the + // local process via some DMA-like method. "edge_name" is the name + // of the tensor being copied, for debugging purposes. Depending on + // the type of devices and memory in use, the copy may be performed + // synchronously or asynchronously. 'done' will be invoked only + // after the copy is actually complete. + static void ViaDMA(const string& edge_name, DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, Device* dst, + const AllocatorAttributes src_alloc_attr, + const AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, StatusCallback done); + + // Register a function for copying between two specific DeviceTypes. + static Status Register(DeviceType sender_device_type, + DeviceType receiver_device_type, + CopyFunction copy_function); + + // Object used to call Register() at static-initialization time. + class Registration { + public: + Registration(DeviceType sender_device_type, DeviceType receiver_device_type, + CopyFunction copy_function) { + TF_QCHECK_OK( + Register(sender_device_type, receiver_device_type, copy_function)); + } + }; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_COPY_TENSOR_H_ diff --git a/tensorflow/core/common_runtime/gpu/dma_helper.h b/tensorflow/core/common_runtime/dma_helper.h index 57303bbf81..d0a6414189 100644 --- a/tensorflow/core/common_runtime/gpu/dma_helper.h +++ b/tensorflow/core/common_runtime/dma_helper.h @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_ -#define TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_ +#ifndef TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_ +#define TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_ #include "tensorflow/core/public/tensor.h" -// For internal use only. Visibility should be limited to brain/framework. - namespace tensorflow { + +// For TensorFlow internal use only. class DMAHelper { public: static bool CanUseDMA(const Tensor* t) { return t->CanUseDMA(); } @@ -29,5 +29,7 @@ class DMAHelper { static TensorBuffer* buffer(Tensor* t) { return t->buf_; } static const TensorBuffer* buffer(const Tensor* t) { return t->buf_; } }; + } // namespace tensorflow -#endif // TENSORFLOW_COMMON_RUNTIME_GPU_DMA_HELPER_H_ + +#endif // TENSORFLOW_COMMON_RUNTIME_DMA_HELPER_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc index 9d2f917f51..b3d5024b5c 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_util.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc @@ -15,8 +15,9 @@ limitations under the License. #include "tensorflow/core/common_runtime/gpu/gpu_util.h" +#include "tensorflow/core/common_runtime/copy_tensor.h" #include "tensorflow/core/common_runtime/device.h" -#include "tensorflow/core/common_runtime/gpu/dma_helper.h" +#include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/common_runtime/gpu/process_state.h" #include "tensorflow/core/common_runtime/gpu_device_context.h" @@ -105,92 +106,49 @@ void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, } } -typedef ProcessState::MemDesc PMD; - -/*static*/ -void GPUUtil::CopyViaDMA(const string& edge_name, - DeviceContext* send_dev_context, - DeviceContext* recv_dev_context, Device* src, - Device* dst, AllocatorAttributes src_alloc_attr, - AllocatorAttributes dst_alloc_attr, - const Tensor* input, Tensor* output, - StatusCallback done) { - port::Tracing::ScopedAnnotation annotation(edge_name); - VLOG(1) << "CopyViaDMA " << edge_name; - size_t total_bytes = input->TotalBytes(); - // Note that 0-size tensors have no backing buffer. - if (total_bytes > 0) { - const void* src_ptr = DMAHelper::base(input); - void* dst_ptr = DMAHelper::base(output); - VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr; - if (FLAGS_brain_gpu_record_mem_types) { - ProcessState::MemDesc smd = ProcessState::singleton()->PtrType(src_ptr); - ProcessState::MemDesc dmd = ProcessState::singleton()->PtrType(dst_ptr); - VLOG(0) << "Src " << smd.DebugString() << " Dst " << dmd.DebugString(); - if (smd.loc == PMD::CPU && dmd.loc == PMD::GPU && (!smd.gpu_registered)) { - LOG(WARNING) << "CPU -> GPU no reg for " << edge_name; - } - if (dmd.loc == PMD::CPU && smd.loc == PMD::GPU && (!dmd.gpu_registered)) { - LOG(WARNING) << "GPU -> CPU no reg for " << edge_name; - } - } - - auto src_device_type = src->attributes().device_type(); - auto dst_device_type = dst->attributes().device_type(); +// static +void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, + AllocatorAttributes src_alloc_attr, + AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + StatusCallback done) { + const void* src_ptr = DMAHelper::base(input); + void* dst_ptr = DMAHelper::base(output); + VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr; + const size_t total_bytes = input->TotalBytes(); - bool non_cpu_src = (!src_alloc_attr.on_host() && - src_device_type != DeviceType(DEVICE_CPU).type()); - bool non_cpu_dst = (!dst_alloc_attr.on_host() && - dst_device_type != DeviceType(DEVICE_CPU).type()); - if (non_cpu_src) { - if (non_cpu_dst) { - // Device to device copy - gpu::Stream* stream = send_dev_context->stream(); - if (stream == nullptr) { - done(errors::Internal("Failed to find device stream")); - return; - } - auto* src_dev_info = src->tensorflow_gpu_device_info(); - CHECK(src_dev_info); + gpu::Stream* stream = send_dev_context->stream(); + if (stream == nullptr) { + done(errors::Internal("Failed to find device stream")); + return; + } + auto* src_dev_info = src->tensorflow_gpu_device_info(); + CHECK(src_dev_info); - DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); - stream->ThenMemcpy( - &gpu_dst_ptr, - DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes}, - total_bytes); - if (dst_device_type == DeviceType(DEVICE_GPU).type()) { - // Use of input may outlive stack scope, so keep a ref. - TensorReference input_ref(*input); - src_dev_info->event_mgr->ThenExecute( - stream, [done, stream, input_ref]() { - input_ref.Unref(); - if (!stream->ok()) { - done(errors::Internal("GPU->GPU Memcpy failed")); - } else { - done(Status::OK()); - } - }); - } - send_dev_context->MaintainLifetimeOnStream(input, stream); + DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); + stream->ThenMemcpy(&gpu_dst_ptr, + DeviceMemoryBase{const_cast<void*>(src_ptr), total_bytes}, + total_bytes); + if (dst->attributes().device_type() == DeviceType(DEVICE_GPU).type()) { + // Use of input may outlive stack scope, so keep a ref. + TensorReference input_ref(*input); + src_dev_info->event_mgr->ThenExecute(stream, [done, stream, input_ref]() { + input_ref.Unref(); + if (!stream->ok()) { + done(errors::Internal("GPU->GPU Memcpy failed")); } else { - // Device to host copy. - return send_dev_context->CopyDeviceTensorToCPU(input, edge_name, src, - output, done); + done(Status::OK()); } - } else if (non_cpu_dst) { - // Host to Device copy. - // Note that this is already an async copy. - recv_dev_context->CopyCPUTensorToDevice(input, dst, output, done); - } else { - memcpy(dst_ptr, src_ptr, total_bytes); - done(Status::OK()); - } - } else { - // buffer is empty - done(Status::OK()); + }); } + send_dev_context->MaintainLifetimeOnStream(input, stream); } +static CopyTensor::Registration register_gpu_gpu_copy( + DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy); + void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device, const DeviceContext* device_context, const Tensor* gpu_tensor, Tensor* cpu_tensor, diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h index b9044fcd08..ab9d87186a 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_util.h +++ b/tensorflow/core/common_runtime/gpu/gpu_util.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_COMMON_RUNTIME_GPU_GPU_UTIL_H_ #include "tensorflow/core/common_runtime/device.h" -#include "tensorflow/core/common_runtime/gpu/dma_helper.h" +#include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/public/status.h" #include "tensorflow/core/public/tensor.h" @@ -42,20 +42,6 @@ class GPUUtil { TensorProto* proto, bool is_dead, StatusCallback done); - // Copies "input" to "output" between devices accessible to the - // local process via some DMA-like method. "edge_name" is the name - // of the tensor being copied, for debugging purposes. Depending on - // the type of devices and memory in use, the copy may be performed - // synchronously or asynchronously. 'done' will be invoked only - // after the copy is actually complete. - static void CopyViaDMA(const string& edge_name, - DeviceContext* send_dev_context, - DeviceContext* recv_dev_context, Device* src, - Device* dst, const AllocatorAttributes src_alloc_attr, - const AllocatorAttributes dst_alloc_attr, - const Tensor* input, Tensor* output, - StatusCallback done); - // Copies the data in 'gpu_tensor' into 'cpu_tensor'. // 'gpu_tensor''s backing memory must be on 'gpu_device' and // 'cpu_tensor' must be allocated to be of the same size as @@ -96,6 +82,14 @@ class GPUUtil { const DeviceContext* device_context, Device* gpu_device, Tensor* gpu_tensor, StatusCallback done); + + static void DeviceToDeviceCopy(DeviceContext* send_dev_context, + DeviceContext* recv_dev_context, Device* src, + Device* dst, + AllocatorAttributes src_alloc_attr, + AllocatorAttributes dst_alloc_attr, + const Tensor* input, Tensor* output, + StatusCallback done); }; } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc index 95c11c075d..2f311f9db8 100644 --- a/tensorflow/core/common_runtime/rendezvous_mgr.cc +++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc @@ -17,11 +17,9 @@ limitations under the License. #include <unordered_set> +#include "tensorflow/core/common_runtime/copy_tensor.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" -#if !defined(__ANDROID__) && (defined(PLATFORM_GOOGLE) || GOOGLE_CUDA) -#include "tensorflow/core/common_runtime/gpu/gpu_util.h" -#endif #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/notification.h" @@ -33,35 +31,6 @@ limitations under the License. namespace tensorflow { -namespace { - -void CopyTensorBetweenDevices(const string& id, DeviceContext* send_dev_context, - DeviceContext* recv_dev_context, Device* src, - Device* dst, - const AllocatorAttributes src_alloc_attr, - const AllocatorAttributes dst_alloc_attr, - const Tensor* input, Tensor* output, - std::function<void(const Status&)> done) { - if (src->attributes().device_type() != dst->attributes().device_type()) { - done(errors::Unimplemented( - "Copy between device types not yet implemented: src=", src->name(), - " dst=", dst->name())); - } else if (src->attributes().device_type() != "CPU") { - done(errors::Unimplemented( - "Copy between non-CPU devices not yet implemented")); - } - *output = *input; - done(Status::OK()); -} - -#if !defined(__ANDROID__) && (defined(PLATFORM_GOOGLE) || GOOGLE_CUDA) -constexpr auto CopyTensorBetweenDevicesFunc = &GPUUtil::CopyViaDMA; -#else -constexpr auto CopyTensorBetweenDevicesFunc = &CopyTensorBetweenDevices; -#endif - -} // end namespace - IntraProcessRendezvous::IntraProcessRendezvous(const DeviceMgr* device_mgr) : device_mgr_(device_mgr), local_(NewLocalRendezvous()) {} @@ -136,10 +105,10 @@ void IntraProcessRendezvous::SameWorkerRecvDone( Tensor copy(out_allocator, in.dtype(), in.shape()); *out = copy; - CopyTensorBetweenDevicesFunc(parsed.edge_name, send_args.device_context, - recv_args.device_context, src_device, dst_device, - send_args.alloc_attrs, recv_args.alloc_attrs, - &in, out, done); + CopyTensor::ViaDMA(parsed.edge_name, send_args.device_context, + recv_args.device_context, src_device, dst_device, + send_args.alloc_attrs, recv_args.alloc_attrs, &in, out, + done); } void IntraProcessRendezvous::RecvAsync(const string& key, |