diff options
author | Smit Hinsu <hinsu@google.com> | 2018-05-21 17:42:15 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-05-21 17:44:41 -0700 |
commit | b1139814f91c5216eb5ff229ee7e1982e5f4e888 (patch) | |
tree | 7f85c8229bfd47eeba49890aa75b59c8680e619c /tensorflow/stream_executor | |
parent | d913a243196fa07d4728c8f7c1ce6444ecd086eb (diff) |
Introduce an option to allocate CUDA unified memory
PiperOrigin-RevId: 197490523
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_driver.cc | 32 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_driver.h | 10 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_gpu_executor.h | 8 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream_executor_internal.h | 9 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream_executor_pimpl.cc | 14 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream_executor_pimpl.h | 10 |
6 files changed, 83 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index 273ed83997..09e9f9f758 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -21,6 +21,7 @@ limitations under the License. #include <set> #include <utility> +#include "cuda/include/cuda_runtime.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" #include "tensorflow/stream_executor/lib/casts.h" #include "tensorflow/stream_executor/lib/env.h" @@ -924,6 +925,37 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) { } } +/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context, + uint64 bytes) { + ScopedActivateContext activation(context); + CUdeviceptr result = 0; + // CU_MEM_ATTACH_GLOBAL: the managed memory is accessible from any stream on any device. + CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to alloc " << bytes + << " bytes unified memory; result: " << ToString(res); + return nullptr; + } + void *ptr = reinterpret_cast<void *>(result); + VLOG(2) << "allocated " << ptr << " for context " << context << " of " + << bytes << " bytes in unified memory"; + return ptr; +} + +/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context, + void *location) { + ScopedActivateContext activation(context); + CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location); + CUresult res = cuMemFree(pointer); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to free unified memory at " << location + << "; result: " << ToString(res); + } else { + VLOG(2) << "deallocated unified memory at " << location << " for context " + << context; + } +} + /* static */ void *CUDADriver::HostAllocate(CudaContext *context, uint64 bytes) { ScopedActivateContext activation(context); diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h index 
b952cfaf68..3713a5b7b9 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.h +++ b/tensorflow/stream_executor/cuda/cuda_driver.h @@ -106,6 +106,16 @@ class CUDADriver { // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a static void DeviceDeallocate(CudaContext* context, void *location); + // Allocates a unified memory space of size bytes associated with the given + // context via cuMemAllocManaged. + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32 + static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes); + + // Deallocates a unified memory space previously allocated with + // UnifiedMemoryAllocate for the given context, via cuMemFree. + // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a + static void UnifiedMemoryDeallocate(CudaContext* context, void* location); + // Allocates page-locked and CUDA-registered memory on the host via // cuMemAllocHost. // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0 diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h index f686685474..773cbfb8a1 100644 --- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h @@ -74,6 +74,14 @@ class CUDAExecutor : public internal::StreamExecutorInterface { void Deallocate(DeviceMemoryBase *mem) override; + void *UnifiedMemoryAllocate(uint64 size) override { + return CUDADriver::UnifiedMemoryAllocate(context_, size); + } + + void UnifiedMemoryDeallocate(void *location) override { + return CUDADriver::UnifiedMemoryDeallocate(context_, location); + } + // CUDA allocation/registration functions are necessary because the driver // internally sets up buffers for DMA operations (and page locks them). 
// There's no external interface for us to otherwise control these DMA diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h index 2584c92f0c..9c989b971d 100644 --- a/tensorflow/stream_executor/stream_executor_internal.h +++ b/tensorflow/stream_executor/stream_executor_internal.h @@ -174,6 +174,15 @@ class StreamExecutorInterface { virtual void *AllocateSubBuffer(DeviceMemoryBase *parent, uint64 offset, uint64 size) = 0; virtual void Deallocate(DeviceMemoryBase *mem) = 0; + // Allocates unified memory space of the given size, if supported. + // See + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd + // for more details on unified memory. + virtual void *UnifiedMemoryAllocate(uint64 size) { return nullptr; } + + // Deallocates unified memory space previously allocated with + // UnifiedMemoryAllocate. + virtual void UnifiedMemoryDeallocate(void *mem) {} virtual void *HostMemoryAllocate(uint64 size) = 0; virtual void HostMemoryDeallocate(void *mem) = 0; virtual bool HostMemoryRegister(void *mem, uint64 size) = 0; diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index eecd5bfe1f..b222a4d82a 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -464,6 +464,20 @@ bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem, return implementation_->GetSymbol(symbol_name, mem, bytes); } +void *StreamExecutor::UnifiedMemoryAllocate(uint64 bytes) { + void *buffer = implementation_->UnifiedMemoryAllocate(bytes); + VLOG(1) << "Called StreamExecutor::UnifiedMemoryAllocate(size=" << bytes + << ") returns " << buffer << StackTraceIfVLOG10(); + return buffer; +} + +void StreamExecutor::UnifiedMemoryDeallocate(void *location) { + VLOG(1) << "Called StreamExecutor::UnifiedMemoryDeallocate(location=" + << location << 
")" << StackTraceIfVLOG10(); + + return implementation_->UnifiedMemoryDeallocate(location); +} + void *StreamExecutor::HostMemoryAllocate(uint64 size) { void *buffer = implementation_->HostMemoryAllocate(size); VLOG(1) << "Called StreamExecutor::HostMemoryAllocate(size=" << size diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index e426cf9931..ad80a1ba25 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -190,6 +190,16 @@ class StreamExecutor { // activated. void GetMemAllocs(std::map<void *, AllocRecord> *records_out); + // Allocates unified memory space of the given size, if supported. + // See + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd + // for more details on unified memory. + void *UnifiedMemoryAllocate(uint64 bytes); + + // Deallocates unified memory space previously allocated with + // UnifiedMemoryAllocate. + void UnifiedMemoryDeallocate(void *location); + // Allocates a region of host memory and registers it with the platform API. // Memory allocated in this manner (or allocated and registered with // HostMemoryRegister() is required for use in asynchronous memcpy operations, |