diff options
author    | Sanjoy Das <sanjoy@google.com>                   | 2018-07-23 16:17:12 -0700
committer | TensorFlower Gardener <gardener@tensorflow.org>  | 2018-07-23 16:20:36 -0700
commit    | 632e48c27e09b53ab52523149e759f9bc1711e71 (patch)
tree      | 6c080226ca18ed1937b8a2afe973702d0ffffaee /tensorflow/stream_executor/cuda
parent    | 9225bbbe0aaaa14b69176576097bb67bae98e6c5 (diff)
Teach StreamExecutor to load modules and resolve symbols in them
This will be used in a future CL.
PiperOrigin-RevId: 205742731
Diffstat (limited to 'tensorflow/stream_executor/cuda')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_gpu_executor.cc | 179
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_gpu_executor.h  |  16
2 files changed, 138 insertions, 57 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc index 259c813c57..73f05b94db 100644 --- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc @@ -206,6 +206,48 @@ static string GetBinaryDir(bool strip_exe) { return exe_path; } +bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) { + uint64_t module_refcount; + std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin]; + + if (*module == nullptr) { + auto load_status = CUDADriver::LoadCubin(context_, cubin, module); + if (!load_status.ok()) { + LOG(ERROR) << "failed to load CUBIN: " << load_status; + return false; + } + module_refcount = 1; + VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin) + << " as module " << *module; + } else { + ++module_refcount; + VLOG(3) << "CUBIN " << static_cast<const void *>(cubin) + << " is already loaded as module " << *module; + } + gpu_binary_to_module_[cubin] = {*module, module_refcount}; + return true; +} + +bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) { + uint64_t module_refcount; + std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx]; + + if (*module == nullptr) { + if (!CUDADriver::LoadPtx(context_, ptx, module)) { + return false; + } + VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module " + << *module; + module_refcount = 1; + } else { + ++module_refcount; + VLOG(3) << "PTX " << static_cast<const void *>(ptx) + << " is already loaded as module " << module; + } + gpu_binary_to_module_[ptx] = {*module, module_refcount}; + return true; +} + bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec, KernelBase *kernel) { CUDAKernel *cuda_kernel = AsCUDAKernel(kernel); @@ -215,28 +257,13 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec, VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name(); if 
(spec.has_cuda_cubin_in_memory()) { + mutex_lock lock{in_memory_modules_mu_}; kernelname = &spec.cuda_cubin_in_memory().kernelname(); const char *cubin = spec.cuda_cubin_in_memory().bytes(); - mutex_lock lock{in_memory_modules_mu_}; - uint64_t module_refcount; - std::tie(module, module_refcount) = gpu_binary_to_module_[cubin]; - - if (module == nullptr) { - auto load_status = CUDADriver::LoadCubin(context_, cubin, &module); - if (!load_status.ok()) { - LOG(ERROR) << "failed to load CUBIN: " << load_status; - return false; - } - module_refcount = 1; - VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin) - << " as module " << module; - } else { - ++module_refcount; - VLOG(3) << "CUBIN " << static_cast<const void *>(cubin) - << " is already loaded as module " << module; + if (!LoadModuleFromCuBin(cubin, &module)) { + return false; } kernel_to_gpu_binary_[kernel] = cubin; - gpu_binary_to_module_[cubin] = {module, module_refcount}; } else if (spec.has_cuda_ptx_in_memory()) { kernelname = &spec.cuda_ptx_in_memory().kernelname(); @@ -254,24 +281,10 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec, } mutex_lock lock{in_memory_modules_mu_}; - uint64_t module_refcount; - std::tie(module, module_refcount) = gpu_binary_to_module_[ptx]; - - if (module == nullptr) { - if (!CUDADriver::LoadPtx(context_, ptx, &module)) { - LOG(ERROR) << "failed to load PTX for kernel " << *kernelname; - return false; - } - VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) - << " as module " << module; - module_refcount = 1; - } else { - ++module_refcount; - VLOG(3) << "PTX " << static_cast<const void *>(ptx) - << " is already loaded as module " << module; + if (!LoadModuleFromPtx(ptx, &module)) { + return false; } kernel_to_gpu_binary_[kernel] = ptx; - gpu_binary_to_module_[ptx] = {module, module_refcount}; } else { LOG(WARNING) << "no method of loading CUDA kernel provided"; return false; @@ -295,6 +308,23 @@ bool CUDAExecutor::GetKernel(const 
MultiKernelLoaderSpec &spec, return true; } +bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) { + auto module_it = gpu_binary_to_module_.find(gpu_binary); + if (gpu_binary_to_module_.end() == module_it) { + VLOG(3) << "No loaded CUDA module for " << gpu_binary; + return false; + } + auto &module = module_it->second.first; + auto &refcount = module_it->second.second; + VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount; + if (--refcount == 0) { + VLOG(3) << "Unloading CUDA module " << module; + CUDADriver::UnloadModule(context_, module); + gpu_binary_to_module_.erase(module_it); + } + return true; +} + void CUDAExecutor::UnloadKernel(const KernelBase *kernel) { VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name(); @@ -307,25 +337,52 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) { } VLOG(3) << "Kernel " << kernel << " : " << kernel->name() << " has loaded GPU code " << gpu_binary_it->second; - auto module_it = gpu_binary_to_module_.find(gpu_binary_it->second); - if (gpu_binary_to_module_.end() == module_it) { - VLOG(3) << "Kernel " << kernel << " : " << kernel->name() - << " has no loaded CUDA module."; - return; // This kernel never loaded any modules - } - auto &module = module_it->second.first; - auto &refcount = module_it->second.second; - VLOG(3) << "Kernel " << kernel << " : " << kernel->name() - << " has loaded GPU code " << gpu_binary_it->second - << " into CUDA module " << module << " with refcount " << refcount; - if (--refcount == 0) { - VLOG(3) << "Unloading CUDA module " << module; - CUDADriver::UnloadModule(context_, module); - gpu_binary_to_module_.erase(module_it); - } + UnloadGpuBinary(gpu_binary_it->second); kernel_to_gpu_binary_.erase(gpu_binary_it); } +bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec, + ModuleHandle *module_handle) { + // In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as + // ModuleHandle::id(). 
+ CUmodule cu_module; + if (spec.has_cuda_cubin_in_memory()) { + mutex_lock lock{in_memory_modules_mu_}; + if (!LoadModuleFromCuBin( + reinterpret_cast<const char *>(spec.cuda_cubin_in_memory().data()), + &cu_module)) { + return false; + } + *module_handle = ModuleHandle(const_cast<void *>( + static_cast<const void *>(spec.cuda_cubin_in_memory().data()))); + return true; + } else if (spec.has_cuda_ptx_in_memory()) { + if (cc_major_ == 0 && cc_minor_ == 0) { + return false; + } + + if (!spec.cuda_ptx_in_memory()) { + return false; + } + + mutex_lock lock{in_memory_modules_mu_}; + if (!LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module)) { + return false; + } + *module_handle = ModuleHandle(const_cast<void *>( + static_cast<const void *>(spec.cuda_ptx_in_memory()))); + return true; + } + LOG(WARNING) << "no method of loading CUDA module provided"; + return false; +} + +bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) { + const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id()); + mutex_lock lock{in_memory_modules_mu_}; + return UnloadGpuBinary(gpu_binary); +} + bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel, KernelMetadata *kernel_metadata) { int value; @@ -783,16 +840,26 @@ bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const { return CUDADriver::GetDeviceMemoryInfo(context_, free, total); } -bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem, +bool CUDAExecutor::GetSymbol(const string &symbol_name, + ModuleHandle module_handle, void **mem, size_t *bytes) { + auto lookup_in_module = [&](CUmodule module) { + CHECK(module != nullptr); + return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(), + reinterpret_cast<CUdeviceptr *>(mem), + bytes); + }; + { // give limited scope to mutex_lock mutex_lock lock{in_memory_modules_mu_}; + if (static_cast<bool>(module_handle)) { + auto it = gpu_binary_to_module_.find(module_handle.id()); + CHECK(it != gpu_binary_to_module_.end()); + 
return lookup_in_module(it->second.first); + } + for (auto &it : gpu_binary_to_module_) { - CUmodule module = it.second.first; - CHECK(module != nullptr); - if (CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(), - reinterpret_cast<CUdeviceptr *>(mem), - bytes)) { + if (lookup_in_module(it.second.first)) { return true; } } diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h index f7c341c857..8a954d5461 100644 --- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h +++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h @@ -62,6 +62,9 @@ class CUDAExecutor : public internal::StreamExecutorInterface { bool GetKernel(const MultiKernelLoaderSpec &spec, KernelBase *kernel) override; void UnloadKernel(const KernelBase *kernel) override; + bool LoadModule(const MultiModuleLoaderSpec &spec, + ModuleHandle *module_handle) override; + bool UnloadModule(ModuleHandle module_handle) override; bool Launch(Stream *stream, const ThreadDim &thread_dims, const BlockDim &block_dims, const KernelBase &k, @@ -175,7 +178,8 @@ class CUDAExecutor : public internal::StreamExecutorInterface { // Search for the symbol and returns a device pointer and size. // Returns false if symbol does not exist. - bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override; + bool GetSymbol(const string &symbol_name, ModuleHandle module_handle, + void **mem, size_t *bytes) override; DeviceDescription *PopulateDeviceDescription() const override; @@ -239,6 +243,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface { void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims, const BlockDim &block_dims); + bool LoadModuleFromCuBin(const char *cubin, CUmodule *module) + EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_); + + // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated. 
+ bool LoadModuleFromPtx(const char *ptx, CUmodule *module) + EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_); + + bool UnloadGpuBinary(const void *gpu_binary) + EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_); + // Guards the in-memory-module mapping. mutex in_memory_modules_mu_; |