aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/cuda
diff options
context:
space:
mode:
authorGravatar Smit Hinsu <hinsu@google.com>2018-05-21 17:42:15 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-05-21 17:44:41 -0700
commitb1139814f91c5216eb5ff229ee7e1982e5f4e888 (patch)
tree7f85c8229bfd47eeba49890aa75b59c8680e619c /tensorflow/stream_executor/cuda
parentd913a243196fa07d4728c8f7c1ce6444ecd086eb (diff)
Introduce an option to allocate CUDA unified memory
PiperOrigin-RevId: 197490523
Diffstat (limited to 'tensorflow/stream_executor/cuda')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_driver.cc32
-rw-r--r--tensorflow/stream_executor/cuda/cuda_driver.h10
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.h8
3 files changed, 50 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 273ed83997..09e9f9f758 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -21,6 +21,7 @@ limitations under the License.
#include <set>
#include <utility>
+#include "cuda/include/cuda_runtime.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/lib/env.h"
@@ -924,6 +925,37 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
+// Allocates |bytes| of CUDA unified (managed) memory inside |context| via
+// cuMemAllocManaged with CU_MEM_ATTACH_GLOBAL, so the allocation is visible
+// to all CUDA contexts. Returns the host-usable pointer, or nullptr on
+// failure (the CUDA error is logged, not propagated).
+/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
+ uint64 bytes) {
+ ScopedActivateContext activation(context);
+ CUdeviceptr result = 0;
+ // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
+ CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
+ if (res != CUDA_SUCCESS) {
+ LOG(ERROR) << "failed to alloc " << bytes
+ << " bytes unified memory; result: " << ToString(res);
+ return nullptr;
+ }
+ // CUdeviceptr is an integer handle; for managed memory it is also a valid
+ // host pointer, so reinterpret it for callers.
+ void *ptr = reinterpret_cast<void *>(result);
+ VLOG(2) << "allocated " << ptr << " for context " << context << " of "
+ << bytes << " bytes in unified memory";
+ return ptr;
+}
+
+// Frees unified memory previously returned by UnifiedMemoryAllocate via
+// cuMemFree. A failure is logged but not propagated to the caller.
+/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
+ void *location) {
+ ScopedActivateContext activation(context);
+ // Convert the host-visible managed pointer back to the driver's integer
+ // device-pointer handle.
+ CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
+ CUresult res = cuMemFree(pointer);
+ if (res != CUDA_SUCCESS) {
+ LOG(ERROR) << "failed to free unified memory at " << location
+ << "; result: " << ToString(res);
+ } else {
+ VLOG(2) << "deallocated unified memory at " << location << " for context "
+ << context;
+ }
+}
+
/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
uint64 bytes) {
ScopedActivateContext activation(context);
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index b952cfaf68..3713a5b7b9 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -106,6 +106,16 @@ class CUDADriver {
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void DeviceDeallocate(CudaContext* context, void *location);
+ // Allocates a unified memory space of size bytes associated with the given
+ // context via cuMemAllocManaged.
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
+ static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);
+
+ // Deallocates a unified memory space, previously allocated with
+ // UnifiedMemoryAllocate, associated with the given
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+ static void UnifiedMemoryDeallocate(CudaContext* context, void* location);
+
// Allocates page-locked and CUDA-registered memory on the host via
// cuMemAllocHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index f686685474..773cbfb8a1 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -74,6 +74,14 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
void Deallocate(DeviceMemoryBase *mem) override;
+ // Allocates |size| bytes of CUDA unified (managed) memory by delegating to
+ // CUDADriver::UnifiedMemoryAllocate for this executor's context; returns
+ // nullptr on failure.
+ void *UnifiedMemoryAllocate(uint64 size) override {
+ return CUDADriver::UnifiedMemoryAllocate(context_, size);
+ }
+
+ // Frees unified memory obtained from UnifiedMemoryAllocate by delegating to
+ // CUDADriver::UnifiedMemoryDeallocate for this executor's context.
+ // NOTE: dropped the original `return` of a void expression — legal C++ but
+ // misleading in a void function; a plain call states the intent.
+ void UnifiedMemoryDeallocate(void *location) override {
+ CUDADriver::UnifiedMemoryDeallocate(context_, location);
+ }
+
// CUDA allocation/registration functions are necessary because the driver
// internally sets up buffers for DMA operations (and page locks them).
// There's no external interface for us to otherwise control these DMA