about | summary | refs | log | tree | commit | diff | homepage
path: root/tensorflow/stream_executor
diff options
context:
space:
mode:
authorGravatar Smit Hinsu <hinsu@google.com>2018-05-21 17:42:15 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-05-21 17:44:41 -0700
commitb1139814f91c5216eb5ff229ee7e1982e5f4e888 (patch)
tree7f85c8229bfd47eeba49890aa75b59c8680e619c /tensorflow/stream_executor
parentd913a243196fa07d4728c8f7c1ce6444ecd086eb (diff)
Introduce an option to allocate CUDA unified memory
PiperOrigin-RevId: 197490523
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_driver.cc        | 32
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_driver.h         | 10
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_gpu_executor.h   |  8
-rw-r--r--  tensorflow/stream_executor/stream_executor_internal.h |  9
-rw-r--r--  tensorflow/stream_executor/stream_executor_pimpl.cc   | 14
-rw-r--r--  tensorflow/stream_executor/stream_executor_pimpl.h    | 10
6 files changed, 83 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 273ed83997..09e9f9f758 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -21,6 +21,7 @@ limitations under the License.
#include <set>
#include <utility>
+#include "cuda/include/cuda_runtime.h"
#include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/lib/env.h"
@@ -924,6 +925,37 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
}
}
+/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
+ uint64 bytes) {
+ ScopedActivateContext activation(context);
+ CUdeviceptr result = 0;
+ // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
+ CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
+ if (res != CUDA_SUCCESS) {
+ LOG(ERROR) << "failed to alloc " << bytes
+ << " bytes unified memory; result: " << ToString(res);
+ return nullptr;
+ }
+ void *ptr = reinterpret_cast<void *>(result);
+ VLOG(2) << "allocated " << ptr << " for context " << context << " of "
+ << bytes << " bytes in unified memory";
+ return ptr;
+}
+
+/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
+ void *location) {
+ ScopedActivateContext activation(context);
+ CUdeviceptr pointer = port::bit_cast<CUdeviceptr>(location);
+ CUresult res = cuMemFree(pointer);
+ if (res != CUDA_SUCCESS) {
+ LOG(ERROR) << "failed to free unified memory at " << location
+ << "; result: " << ToString(res);
+ } else {
+ VLOG(2) << "deallocated unified memory at " << location << " for context "
+ << context;
+ }
+}
+
/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
uint64 bytes) {
ScopedActivateContext activation(context);
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index b952cfaf68..3713a5b7b9 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -106,6 +106,16 @@ class CUDADriver {
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
static void DeviceDeallocate(CudaContext* context, void *location);
+ // Allocates a unified memory space of size bytes associated with the given
+ // context via cuMemAllocManaged.
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
+ static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);
+
+  // Deallocates a unified memory space previously allocated with
+  // UnifiedMemoryAllocate, associated with the given
+ // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+ static void UnifiedMemoryDeallocate(CudaContext* context, void* location);
+
// Allocates page-locked and CUDA-registered memory on the host via
// cuMemAllocHost.
// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index f686685474..773cbfb8a1 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -74,6 +74,14 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
void Deallocate(DeviceMemoryBase *mem) override;
+ void *UnifiedMemoryAllocate(uint64 size) override {
+ return CUDADriver::UnifiedMemoryAllocate(context_, size);
+ }
+
+ void UnifiedMemoryDeallocate(void *location) override {
+ return CUDADriver::UnifiedMemoryDeallocate(context_, location);
+ }
+
// CUDA allocation/registration functions are necessary because the driver
// internally sets up buffers for DMA operations (and page locks them).
// There's no external interface for us to otherwise control these DMA
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 2584c92f0c..9c989b971d 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -174,6 +174,15 @@ class StreamExecutorInterface {
virtual void *AllocateSubBuffer(DeviceMemoryBase *parent, uint64 offset,
uint64 size) = 0;
virtual void Deallocate(DeviceMemoryBase *mem) = 0;
+ // Allocates unified memory space of the given size, if supported.
+ // See
+ // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd
+ // for more details on unified memory.
+ virtual void *UnifiedMemoryAllocate(uint64 size) { return nullptr; }
+
+ // Deallocates unified memory space previously allocated with
+ // UnifiedMemoryAllocate.
+ virtual void UnifiedMemoryDeallocate(void *mem) {}
virtual void *HostMemoryAllocate(uint64 size) = 0;
virtual void HostMemoryDeallocate(void *mem) = 0;
virtual bool HostMemoryRegister(void *mem, uint64 size) = 0;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index eecd5bfe1f..b222a4d82a 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -464,6 +464,20 @@ bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem,
return implementation_->GetSymbol(symbol_name, mem, bytes);
}
+void *StreamExecutor::UnifiedMemoryAllocate(uint64 bytes) {
+ void *buffer = implementation_->UnifiedMemoryAllocate(bytes);
+ VLOG(1) << "Called StreamExecutor::UnifiedMemoryAllocate(size=" << bytes
+ << ") returns " << buffer << StackTraceIfVLOG10();
+ return buffer;
+}
+
+void StreamExecutor::UnifiedMemoryDeallocate(void *location) {
+ VLOG(1) << "Called StreamExecutor::UnifiedMemoryDeallocate(location="
+ << location << ")" << StackTraceIfVLOG10();
+
+ return implementation_->UnifiedMemoryDeallocate(location);
+}
+
void *StreamExecutor::HostMemoryAllocate(uint64 size) {
void *buffer = implementation_->HostMemoryAllocate(size);
VLOG(1) << "Called StreamExecutor::HostMemoryAllocate(size=" << size
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index e426cf9931..ad80a1ba25 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -190,6 +190,16 @@ class StreamExecutor {
// activated.
void GetMemAllocs(std::map<void *, AllocRecord> *records_out);
+ // Allocates unified memory space of the given size, if supported.
+ // See
+ // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-unified-memory-programming-hd
+ // for more details on unified memory.
+ void *UnifiedMemoryAllocate(uint64 bytes);
+
+ // Deallocates unified memory space previously allocated with
+ // UnifiedMemoryAllocate.
+ void UnifiedMemoryDeallocate(void *location);
+
// Allocates a region of host memory and registers it with the platform API.
// Memory allocated in this manner (or allocated and registered with
// HostMemoryRegister() is required for use in asynchronous memcpy operations,