aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/cuda
diff options
context:
space:
mode:
authorGravatar Smit Hinsu <hinsu@google.com>2018-05-21 17:42:15 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-05-21 17:44:41 -0700
commitb1139814f91c5216eb5ff229ee7e1982e5f4e888 (patch)
tree7f85c8229bfd47eeba49890aa75b59c8680e619c /tensorflow/stream_executor/cuda
parentd913a243196fa07d4728c8f7c1ce6444ecd086eb (diff)
Introduce an option to allocate CUDA unified memory
PiperOrigin-RevId: 197490523
Diffstat (limited to 'tensorflow/stream_executor/cuda')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_driver.cc32
-rw-r--r--tensorflow/stream_executor/cuda/cuda_driver.h10
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.h8
3 files changed, 50 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 273ed83997..09e9f9f758 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -21,6 +21,7 @@ limitations under the License.
#include <set>
#include <utility>
+#include "cuda/include/cuda_runtime.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/lib/env.h"
@@ -924,6 +925,37 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
+// Allocates |bytes| of CUDA unified (managed) memory inside |context| via
+// cuMemAllocManaged with CU_MEM_ATTACH_GLOBAL, so the allocation is visible
+// to all CUDA contexts. Returns the host-usable pointer, or nullptr on
+// failure (the CUDA error is logged, not propagated).
+/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
+ uint64 bytes) {
+ ScopedActivateContext activation(context);
+ CUdeviceptr result = 0;
+ // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
+ CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
+ if (res != CUDA_SUCCESS) {
+ LOG(ERROR) << "failed to alloc " << bytes
+ << " bytes unified memory; result: " << ToString(res);
+ return nullptr;
+ }
+ // CUdeviceptr is an integer handle; for managed memory it is also a valid
+ // host pointer, so reinterpret it for callers.
+ void *ptr = reinterpret_cast<void *>(result);
+ VLOG(2) << "allocated " << ptr << " for context " << context << " of "
+ << bytes << " bytes in unified memory";
+ return ptr;
+}
+
+// Frees unified memory previously returned by UnifiedMemoryAllocate via
+// cuMemFree. A failure is logged but not propagated to the caller.
+/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
+ void *location) {
+ ScopedActivateContext activation(context);
+ // Convert the host-visible managed pointer back to the driver's integer
+ // device-pointer handle.
+ CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
+ CUresult res = cuMemFree(pointer);
+ if (res != CUDA_SUCCESS) {
+ LOG(ERROR) << "failed to free unified memory at " << location
+ << "; result: " << ToString(res);
+ } else {
+ VLOG(2) << "deallocated unified memory at " << location << " for context "
+ << context;
+ }
+}
+
/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
uint64 bytes) {
ScopedActivateContext activation(context);
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index b952cfaf68..3713a5b7b9 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -106,6 +106,16 @@ class CUDADriver {
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void DeviceDeallocate(CudaContext* context, void *location);
+ // Allocates a unified memory space of size bytes associated with the given
+ // context via cuMemAllocManaged.
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
+ static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);
+
+ // Deallocates a unified memory space, previously allocated with
+ // UnifiedMemoryAllocate, associated with the given
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+ static void UnifiedMemoryDeallocate(CudaContext* context, void* location);
+
// Allocates page-locked and CUDA-registered memory on the host via
// cuMemAllocHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index f686685474..773cbfb8a1 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -74,6 +74,14 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
void Deallocate(DeviceMemoryBase *mem) override;
+ // Allocates |size| bytes of CUDA unified (managed) memory by delegating to
+ // CUDADriver::UnifiedMemoryAllocate for this executor's context; returns
+ // nullptr on failure.
+ void *UnifiedMemoryAllocate(uint64 size) override {
+ return CUDADriver::UnifiedMemoryAllocate(context_, size);
+ }
+
+ // Frees unified memory obtained from UnifiedMemoryAllocate by delegating to
+ // CUDADriver::UnifiedMemoryDeallocate for this executor's context.
+ // NOTE: dropped the original `return` of a void expression — legal C++ but
+ // misleading in a void function; a plain call states the intent.
+ void UnifiedMemoryDeallocate(void *location) override {
+ CUDADriver::UnifiedMemoryDeallocate(context_, location);
+ }
+
// CUDA allocation/registration functions are necessary because the driver
// internally sets up buffers for DMA operations (and page locks them).
// There's no external interface for us to otherwise control these DMA