aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_gpu_executor.cc')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.cc181
1 files changed, 124 insertions, 57 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index f11022ef1d..73f05b94db 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -206,6 +206,48 @@ static string GetBinaryDir(bool strip_exe) {
return exe_path;
}
+// Loads the in-memory CUBIN image `cubin` as a CUDA module, or reuses the
+// module already recorded for that pointer in gpu_binary_to_module_.
+//
+// On success stores the module in `*module`, sets the reference count kept
+// for `cubin` to 1 (fresh load) or increments it (already loaded), and
+// returns true. On failure logs the driver status and returns false without
+// adding an entry for `cubin` to gpu_binary_to_module_.
+//
+// The callers visible in this file (GetKernel, LoadModule) take
+// in_memory_modules_mu_ before calling.
+bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
+  // Look up with find() rather than operator[]: operator[] default-inserts a
+  // {nullptr, 0} entry that would survive a failed load, and GetSymbol
+  // CHECK-fails on any null module it meets while scanning the map.
+  auto module_it = gpu_binary_to_module_.find(cubin);
+  if (module_it != gpu_binary_to_module_.end() &&
+      module_it->second.first != nullptr) {
+    *module = module_it->second.first;
+    ++module_it->second.second;
+    VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
+            << " is already loaded as module " << *module;
+    return true;
+  }
+
+  *module = nullptr;
+  auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
+  if (!load_status.ok()) {
+    LOG(ERROR) << "failed to load CUBIN: " << load_status;
+    return false;
+  }
+  VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
+          << " as module " << *module;
+
+  // Only record the binary once the driver has actually produced a module.
+  uint64_t module_refcount = 1;
+  gpu_binary_to_module_[cubin] = {*module, module_refcount};
+  return true;
+}
+
+// Loads the in-memory PTX text `ptx` as a CUDA module, or reuses the module
+// already recorded for that pointer in gpu_binary_to_module_.
+//
+// On success stores the module in `*module`, sets the reference count kept
+// for `ptx` to 1 (fresh load) or increments it (already loaded), and returns
+// true. Returns false when CUDADriver::LoadPtx fails; in that case no entry
+// for `ptx` is added to gpu_binary_to_module_.
+//
+// The callers visible in this file (GetKernel, LoadModule) take
+// in_memory_modules_mu_ before calling.
+bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
+  // Look up with find() rather than operator[]: operator[] default-inserts a
+  // {nullptr, 0} entry that would survive a failed load, and GetSymbol
+  // CHECK-fails on any null module it meets while scanning the map.
+  auto module_it = gpu_binary_to_module_.find(ptx);
+  if (module_it != gpu_binary_to_module_.end() &&
+      module_it->second.first != nullptr) {
+    *module = module_it->second.first;
+    ++module_it->second.second;
+    // Log the module handle itself (*module), not the address of the
+    // out-parameter.
+    VLOG(3) << "PTX " << static_cast<const void *>(ptx)
+            << " is already loaded as module " << *module;
+    return true;
+  }
+
+  *module = nullptr;
+  if (!CUDADriver::LoadPtx(context_, ptx, module)) {
+    return false;
+  }
+  VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
+          << *module;
+
+  // Only record the binary once the driver has actually produced a module.
+  uint64_t module_refcount = 1;
+  gpu_binary_to_module_[ptx] = {*module, module_refcount};
+  return true;
+}
+
bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) {
CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
@@ -215,28 +257,13 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
if (spec.has_cuda_cubin_in_memory()) {
+ mutex_lock lock{in_memory_modules_mu_};
kernelname = &spec.cuda_cubin_in_memory().kernelname();
const char *cubin = spec.cuda_cubin_in_memory().bytes();
- mutex_lock lock{in_memory_modules_mu_};
- uint64_t module_refcount;
- std::tie(module, module_refcount) = gpu_binary_to_module_[cubin];
-
- if (module == nullptr) {
- auto load_status = CUDADriver::LoadCubin(context_, cubin, &module);
- if (!load_status.ok()) {
- LOG(ERROR) << "failed to load CUBIN: " << load_status;
- return false;
- }
- module_refcount = 1;
- VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
- << " as module " << module;
- } else {
- ++module_refcount;
- VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
- << " is already loaded as module " << module;
+ if (!LoadModuleFromCuBin(cubin, &module)) {
+ return false;
}
kernel_to_gpu_binary_[kernel] = cubin;
- gpu_binary_to_module_[cubin] = {module, module_refcount};
} else if (spec.has_cuda_ptx_in_memory()) {
kernelname = &spec.cuda_ptx_in_memory().kernelname();
@@ -254,24 +281,10 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
}
mutex_lock lock{in_memory_modules_mu_};
- uint64_t module_refcount;
- std::tie(module, module_refcount) = gpu_binary_to_module_[ptx];
-
- if (module == nullptr) {
- if (!CUDADriver::LoadPtx(context_, ptx, &module)) {
- LOG(ERROR) << "failed to load PTX for kernel " << *kernelname;
- return false;
- }
- VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx)
- << " as module " << module;
- module_refcount = 1;
- } else {
- ++module_refcount;
- VLOG(3) << "PTX " << static_cast<const void *>(ptx)
- << " is already loaded as module " << module;
+ if (!LoadModuleFromPtx(ptx, &module)) {
+ return false;
}
kernel_to_gpu_binary_[kernel] = ptx;
- gpu_binary_to_module_[ptx] = {module, module_refcount};
} else {
LOG(WARNING) << "no method of loading CUDA kernel provided";
return false;
@@ -295,6 +308,23 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
return true;
}
+// Drops one reference to the CUDA module recorded for `gpu_binary` in
+// gpu_binary_to_module_. When the count reaches zero the module is unloaded
+// through CUDADriver::UnloadModule and its map entry is erased.
+//
+// Returns false if no entry exists for `gpu_binary`, true otherwise.
+// UnloadModule takes in_memory_modules_mu_ before calling this.
+bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
+  auto entry = gpu_binary_to_module_.find(gpu_binary);
+  if (entry == gpu_binary_to_module_.end()) {
+    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
+    return false;
+  }
+
+  auto &cu_module = entry->second.first;
+  auto &users = entry->second.second;
+  VLOG(3) << "Found CUDA module " << cu_module << " with refcount " << users;
+
+  --users;
+  if (users != 0) {
+    // Other users still hold the module; keep it loaded.
+    return true;
+  }
+
+  VLOG(3) << "Unloading CUDA module " << cu_module;
+  CUDADriver::UnloadModule(context_, cu_module);
+  // Erase last: cu_module and users refer into the entry being removed.
+  gpu_binary_to_module_.erase(entry);
+  return true;
+}
+
void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
@@ -307,25 +337,52 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
}
VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
<< " has loaded GPU code " << gpu_binary_it->second;
- auto module_it = gpu_binary_to_module_.find(gpu_binary_it->second);
- if (gpu_binary_to_module_.end() == module_it) {
- VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
- << " has no loaded CUDA module.";
- return; // This kernel never loaded any modules
- }
- auto &module = module_it->second.first;
- auto &refcount = module_it->second.second;
- VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
- << " has loaded GPU code " << gpu_binary_it->second
- << " into CUDA module " << module << " with refcount " << refcount;
- if (--refcount == 0) {
- VLOG(3) << "Unloading CUDA module " << module;
- CUDADriver::UnloadModule(context_, module);
- gpu_binary_to_module_.erase(module_it);
- }
+ UnloadGpuBinary(gpu_binary_it->second);
kernel_to_gpu_binary_.erase(gpu_binary_it);
}
+// Loads the module described by `spec` from an in-memory CUBIN or, failing
+// that, from in-memory PTX, and writes a handle for it to `*module_handle`.
+//
+// CUDAExecutor uses the address of the GPU binary (PTX or CUBIN) as
+// ModuleHandle::id(). Returns true on success; returns false when loading
+// fails or when `spec` carries neither kind of binary.
+bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+                              ModuleHandle *module_handle) {
+  CUmodule loaded_module;
+
+  if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
+    const void *cubin_key =
+        static_cast<const void *>(spec.cuda_cubin_in_memory().data());
+    const bool loaded = LoadModuleFromCuBin(
+        static_cast<const char *>(cubin_key), &loaded_module);
+    if (!loaded) {
+      return false;
+    }
+    *module_handle = ModuleHandle(const_cast<void *>(cubin_key));
+    return true;
+  }
+
+  if (spec.has_cuda_ptx_in_memory()) {
+    // NOTE(review): both fields being zero presumably means the device's
+    // compute capability is unknown -- confirm against where they are set.
+    if (cc_major_ == 0 && cc_minor_ == 0) {
+      return false;
+    }
+
+    const char *ptx_text = spec.cuda_ptx_in_memory();
+    if (ptx_text == nullptr) {
+      return false;
+    }
+
+    mutex_lock lock{in_memory_modules_mu_};
+    const bool loaded = LoadModuleFromPtx(ptx_text, &loaded_module);
+    if (!loaded) {
+      return false;
+    }
+    *module_handle = ModuleHandle(
+        const_cast<void *>(static_cast<const void *>(ptx_text)));
+    return true;
+  }
+
+  LOG(WARNING) << "no method of loading CUDA module provided";
+  return false;
+}
+
+// Releases one reference to the module identified by `module_handle`, whose
+// id() is the GPU binary pointer used as the key in gpu_binary_to_module_.
+// Holds in_memory_modules_mu_ for the duration of the unload and returns
+// whatever UnloadGpuBinary returns.
+bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
+  const void *binary_key = module_handle.id();
+  mutex_lock lock{in_memory_modules_mu_};
+  const bool unloaded = UnloadGpuBinary(binary_key);
+  return unloaded;
+}
+
bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
KernelMetadata *kernel_metadata) {
int value;
@@ -783,16 +840,26 @@ bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
}
-bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem,
+bool CUDAExecutor::GetSymbol(const string &symbol_name,
+ ModuleHandle module_handle, void **mem,
size_t *bytes) {
+ auto lookup_in_module = [&](CUmodule module) {
+ CHECK(module != nullptr);
+ return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
+ reinterpret_cast<CUdeviceptr *>(mem),
+ bytes);
+ };
+
{ // give limited scope to mutex_lock
mutex_lock lock{in_memory_modules_mu_};
+ if (static_cast<bool>(module_handle)) {
+ auto it = gpu_binary_to_module_.find(module_handle.id());
+ CHECK(it != gpu_binary_to_module_.end());
+ return lookup_in_module(it->second.first);
+ }
+
for (auto &it : gpu_binary_to_module_) {
- CUmodule module = it.second.first;
- CHECK(module != nullptr);
- if (CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
- reinterpret_cast<CUdeviceptr *>(mem),
- bytes)) {
+ if (lookup_in_module(it.second.first)) {
return true;
}
}
@@ -844,7 +911,7 @@ CUDAExecutor::GetTimerImplementation() {
return std::unique_ptr<internal::TimerInterface>(new CUDATimer(this));
}
-void *CUDAExecutor::CudaContextHack() { return context_; }
+void *CUDAExecutor::GpuContextHack() { return context_; }
CudaContext* CUDAExecutor::cuda_context() { return context_; }