aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/cuda
diff options
context:
space:
mode:
author: Sanjoy Das <sanjoy@google.com> 2018-07-23 16:17:12 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-07-23 16:20:36 -0700
commit: 632e48c27e09b53ab52523149e759f9bc1711e71 (patch)
tree: 6c080226ca18ed1937b8a2afe973702d0ffffaee /tensorflow/stream_executor/cuda
parent: 9225bbbe0aaaa14b69176576097bb67bae98e6c5 (diff)
Teach StreamExecutor to load modules and resolve symbols in them
This will be used in a future CL. PiperOrigin-RevId: 205742731
Diffstat (limited to 'tensorflow/stream_executor/cuda')
-rw-r--r-- tensorflow/stream_executor/cuda/cuda_gpu_executor.cc 179
-rw-r--r-- tensorflow/stream_executor/cuda/cuda_gpu_executor.h 16
2 files changed, 138 insertions, 57 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 259c813c57..73f05b94db 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -206,6 +206,48 @@ static string GetBinaryDir(bool strip_exe) {
return exe_path;
}
+// Loads the in-memory CUBIN image `cubin` as a CUDA module, or reuses the
+// module already loaded for the same pointer, and stores it in `*module`.
+//
+// Loaded modules are refcounted in gpu_binary_to_module_, keyed by the address
+// of the image; every successful call adds one reference. Returns false if the
+// driver fails to load the image, in which case nothing is recorded in the map.
+bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
+  *module = nullptr;
+
+  // Use find() rather than operator[]: operator[] default-inserts a
+  // {nullptr, 0} entry that would outlive a failed load, and GetSymbol
+  // CHECK-fails on any null module it encounters while scanning the map.
+  // An existing entry whose module is null is treated as "not loaded".
+  auto it = gpu_binary_to_module_.find(cubin);
+  if (it != gpu_binary_to_module_.end() && it->second.first != nullptr) {
+    *module = it->second.first;
+    ++it->second.second;
+    VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
+            << " is already loaded as module " << *module;
+    return true;
+  }
+
+  auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
+  if (!load_status.ok()) {
+    LOG(ERROR) << "failed to load CUBIN: " << load_status;
+    return false;
+  }
+  VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
+          << " as module " << *module;
+
+  // Only a successfully loaded module is ever recorded, with one reference.
+  const uint64_t initial_refcount = 1;
+  gpu_binary_to_module_[cubin] = {*module, initial_refcount};
+  return true;
+}
+
+// Loads the PTX text `ptx` as a CUDA module, or reuses the module already
+// loaded for the same pointer, and stores it in `*module`.
+//
+// Loaded modules are refcounted in gpu_binary_to_module_, keyed by the address
+// of the PTX text; every successful call adds one reference. Returns false if
+// the driver fails to load the PTX, in which case nothing is recorded in the
+// map.
+bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
+  *module = nullptr;
+
+  // Use find() rather than operator[]: operator[] default-inserts a
+  // {nullptr, 0} entry that would outlive a failed load, and GetSymbol
+  // CHECK-fails on any null module it encounters while scanning the map.
+  // An existing entry whose module is null is treated as "not loaded".
+  auto it = gpu_binary_to_module_.find(ptx);
+  if (it != gpu_binary_to_module_.end() && it->second.first != nullptr) {
+    *module = it->second.first;
+    ++it->second.second;
+    // Log the module itself (*module), not the address of the out-parameter.
+    VLOG(3) << "PTX " << static_cast<const void *>(ptx)
+            << " is already loaded as module " << *module;
+    return true;
+  }
+
+  if (!CUDADriver::LoadPtx(context_, ptx, module)) {
+    return false;
+  }
+  VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
+          << *module;
+
+  // Only a successfully loaded module is ever recorded, with one reference.
+  const uint64_t initial_refcount = 1;
+  gpu_binary_to_module_[ptx] = {*module, initial_refcount};
+  return true;
+}
+
bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) {
CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
@@ -215,28 +257,13 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
if (spec.has_cuda_cubin_in_memory()) {
+ mutex_lock lock{in_memory_modules_mu_};
kernelname = &spec.cuda_cubin_in_memory().kernelname();
const char *cubin = spec.cuda_cubin_in_memory().bytes();
- mutex_lock lock{in_memory_modules_mu_};
- uint64_t module_refcount;
- std::tie(module, module_refcount) = gpu_binary_to_module_[cubin];
-
- if (module == nullptr) {
- auto load_status = CUDADriver::LoadCubin(context_, cubin, &module);
- if (!load_status.ok()) {
- LOG(ERROR) << "failed to load CUBIN: " << load_status;
- return false;
- }
- module_refcount = 1;
- VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
- << " as module " << module;
- } else {
- ++module_refcount;
- VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
- << " is already loaded as module " << module;
+ if (!LoadModuleFromCuBin(cubin, &module)) {
+ return false;
}
kernel_to_gpu_binary_[kernel] = cubin;
- gpu_binary_to_module_[cubin] = {module, module_refcount};
} else if (spec.has_cuda_ptx_in_memory()) {
kernelname = &spec.cuda_ptx_in_memory().kernelname();
@@ -254,24 +281,10 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
}
mutex_lock lock{in_memory_modules_mu_};
- uint64_t module_refcount;
- std::tie(module, module_refcount) = gpu_binary_to_module_[ptx];
-
- if (module == nullptr) {
- if (!CUDADriver::LoadPtx(context_, ptx, &module)) {
- LOG(ERROR) << "failed to load PTX for kernel " << *kernelname;
- return false;
- }
- VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx)
- << " as module " << module;
- module_refcount = 1;
- } else {
- ++module_refcount;
- VLOG(3) << "PTX " << static_cast<const void *>(ptx)
- << " is already loaded as module " << module;
+ if (!LoadModuleFromPtx(ptx, &module)) {
+ return false;
}
kernel_to_gpu_binary_[kernel] = ptx;
- gpu_binary_to_module_[ptx] = {module, module_refcount};
} else {
LOG(WARNING) << "no method of loading CUDA kernel provided";
return false;
@@ -295,6 +308,23 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
return true;
}
+// Drops one reference to the CUDA module recorded for `gpu_binary`. When the
+// reference count reaches zero the module is unloaded and its entry is removed
+// from gpu_binary_to_module_. Returns false if no module is recorded for
+// `gpu_binary`, true otherwise.
+bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
+  const auto entry = gpu_binary_to_module_.find(gpu_binary);
+  if (entry == gpu_binary_to_module_.end()) {
+    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
+    return false;
+  }
+
+  auto &loaded = entry->second;  // {module, refcount}
+  VLOG(3) << "Found CUDA module " << loaded.first << " with refcount "
+          << loaded.second;
+
+  loaded.second -= 1;
+  if (loaded.second != 0) {
+    // Other users still hold references; keep the module loaded.
+    return true;
+  }
+
+  VLOG(3) << "Unloading CUDA module " << loaded.first;
+  CUDADriver::UnloadModule(context_, loaded.first);
+  gpu_binary_to_module_.erase(entry);
+  return true;
+}
+
void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
@@ -307,25 +337,52 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
}
VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
<< " has loaded GPU code " << gpu_binary_it->second;
- auto module_it = gpu_binary_to_module_.find(gpu_binary_it->second);
- if (gpu_binary_to_module_.end() == module_it) {
- VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
- << " has no loaded CUDA module.";
- return; // This kernel never loaded any modules
- }
- auto &module = module_it->second.first;
- auto &refcount = module_it->second.second;
- VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
- << " has loaded GPU code " << gpu_binary_it->second
- << " into CUDA module " << module << " with refcount " << refcount;
- if (--refcount == 0) {
- VLOG(3) << "Unloading CUDA module " << module;
- CUDADriver::UnloadModule(context_, module);
- gpu_binary_to_module_.erase(module_it);
- }
+ UnloadGpuBinary(gpu_binary_it->second);
kernel_to_gpu_binary_.erase(gpu_binary_it);
}
+// Loads the CUDA module described by `spec`, preferring an in-memory CUBIN
+// over in-memory PTX, and on success writes a handle for it to
+// `*module_handle`. Returns false if loading fails, if PTX is requested while
+// both cc_major_ and cc_minor_ are zero, if the PTX pointer is null, or if
+// `spec` carries neither a CUBIN nor PTX.
+bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+                              ModuleHandle *module_handle) {
+  // The address of the GPU binary (PTX or CUBIN) doubles as
+  // ModuleHandle::id() in CUDAExecutor.
+  CUmodule loaded_module;
+
+  if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
+    const auto *cubin_bytes = spec.cuda_cubin_in_memory().data();
+    if (!LoadModuleFromCuBin(reinterpret_cast<const char *>(cubin_bytes),
+                             &loaded_module)) {
+      return false;
+    }
+    *module_handle = ModuleHandle(
+        const_cast<void *>(static_cast<const void *>(cubin_bytes)));
+    return true;
+  }
+
+  if (!spec.has_cuda_ptx_in_memory()) {
+    LOG(WARNING) << "no method of loading CUDA module provided";
+    return false;
+  }
+
+  if (cc_major_ == 0 && cc_minor_ == 0) {
+    return false;
+  }
+
+  const char *ptx_text = spec.cuda_ptx_in_memory();
+  if (!ptx_text) {
+    return false;
+  }
+
+  mutex_lock lock{in_memory_modules_mu_};
+  if (!LoadModuleFromPtx(ptx_text, &loaded_module)) {
+    return false;
+  }
+  *module_handle =
+      ModuleHandle(const_cast<void *>(static_cast<const void *>(ptx_text)));
+  return true;
+}
+
+// Releases one reference to the module identified by `module_handle`, whose
+// id is interpreted as the pointer to the GPU binary it was loaded from.
+// Takes in_memory_modules_mu_ and forwards to UnloadGpuBinary, returning its
+// result.
+bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
+  mutex_lock lock{in_memory_modules_mu_};
+  return UnloadGpuBinary(reinterpret_cast<const char *>(module_handle.id()));
+}
+
bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
KernelMetadata *kernel_metadata) {
int value;
@@ -783,16 +840,26 @@ bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
}
-bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem,
+bool CUDAExecutor::GetSymbol(const string &symbol_name,
+ ModuleHandle module_handle, void **mem,
size_t *bytes) {
+ auto lookup_in_module = [&](CUmodule module) {
+ CHECK(module != nullptr);
+ return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
+ reinterpret_cast<CUdeviceptr *>(mem),
+ bytes);
+ };
+
{ // give limited scope to mutex_lock
mutex_lock lock{in_memory_modules_mu_};
+ if (static_cast<bool>(module_handle)) {
+ auto it = gpu_binary_to_module_.find(module_handle.id());
+ CHECK(it != gpu_binary_to_module_.end());
+ return lookup_in_module(it->second.first);
+ }
+
for (auto &it : gpu_binary_to_module_) {
- CUmodule module = it.second.first;
- CHECK(module != nullptr);
- if (CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
- reinterpret_cast<CUdeviceptr *>(mem),
- bytes)) {
+ if (lookup_in_module(it.second.first)) {
return true;
}
}
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index f7c341c857..8a954d5461 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -62,6 +62,9 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
bool GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) override;
void UnloadKernel(const KernelBase *kernel) override;
+ bool LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle) override;
+ bool UnloadModule(ModuleHandle module_handle) override;
bool Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &k,
@@ -175,7 +178,8 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
// Search for the symbol and returns a device pointer and size.
// Returns false if symbol does not exist.
- bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override;
+ bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+ void **mem, size_t *bytes) override;
DeviceDescription *PopulateDeviceDescription() const override;
@@ -239,6 +243,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
const BlockDim &block_dims);
+ // Loads the CUBIN image at `cubin` as a CUDA module, or reuses the module
+ // already recorded for the same pointer and increments its refcount. The
+ // module is stored in `*module`. Returns false if loading fails.
+ bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
+ EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+ // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
+ bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
+ EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+ // Decrements the refcount of the module recorded for `gpu_binary` and
+ // unloads the module when the count reaches zero. Returns false if no module
+ // is recorded for `gpu_binary`.
+ bool UnloadGpuBinary(const void *gpu_binary)
+ EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
// Guards the in-memory-module mapping.
mutex in_memory_modules_mu_;