From 632e48c27e09b53ab52523149e759f9bc1711e71 Mon Sep 17 00:00:00 2001
From: Sanjoy Das
Date: Mon, 23 Jul 2018 16:17:12 -0700
Subject: Teach StreamExecutor to load modules and resolve symbols in them

This will be used in a future CL.

PiperOrigin-RevId: 205742731
---
 .../stream_executor/cuda/cuda_gpu_executor.cc      | 179 ++++++++++++++-------
 .../stream_executor/cuda/cuda_gpu_executor.h       |  16 +-
 tensorflow/stream_executor/module_spec.h           |  65 ++++++++
 .../stream_executor/stream_executor_internal.h     |  32 +++-
 .../stream_executor/stream_executor_pimpl.cc       |  38 ++++-
 tensorflow/stream_executor/stream_executor_pimpl.h |  76 +++++++--
 6 files changed, 331 insertions(+), 75 deletions(-)
 create mode 100644 tensorflow/stream_executor/module_spec.h

(limited to 'tensorflow/stream_executor')

diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 259c813c57..73f05b94db 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -206,6 +206,48 @@ static string GetBinaryDir(bool strip_exe) {
   return exe_path;
 }
 
+bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
+  uint64_t module_refcount;
+  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
+
+  if (*module == nullptr) {
+    auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
+    if (!load_status.ok()) {
+      LOG(ERROR) << "failed to load CUBIN: " << load_status;
+      return false;
+    }
+    module_refcount = 1;
+    VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
+            << " as module " << *module;
+  } else {
+    ++module_refcount;
+    VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
+            << " is already loaded as module " << *module;
+  }
+  gpu_binary_to_module_[cubin] = {*module, module_refcount};
+  return true;
+}
+
+bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
+  uint64_t module_refcount;
+  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
+
+  if (*module == nullptr) {
+    if (!CUDADriver::LoadPtx(context_, ptx, module)) {
+      return false;
+    }
+    VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
+            << *module;
+    module_refcount = 1;
+  } else {
+    ++module_refcount;
+    VLOG(3) << "PTX " << static_cast<const void *>(ptx)
+            << " is already loaded as module " << module;
+  }
+  gpu_binary_to_module_[ptx] = {*module, module_refcount};
+  return true;
+}
+
 bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
                              KernelBase *kernel) {
   CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
@@ -215,28 +257,13 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
   VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
 
   if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
     kernelname = &spec.cuda_cubin_in_memory().kernelname();
     const char *cubin = spec.cuda_cubin_in_memory().bytes();
-    mutex_lock lock{in_memory_modules_mu_};
-    uint64_t module_refcount;
-    std::tie(module, module_refcount) = gpu_binary_to_module_[cubin];
-
-    if (module == nullptr) {
-      auto load_status = CUDADriver::LoadCubin(context_, cubin, &module);
-      if (!load_status.ok()) {
-        LOG(ERROR) << "failed to load CUBIN: " << load_status;
-        return false;
-      }
-      module_refcount = 1;
-      VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
-              << " as module " << module;
-    } else {
-      ++module_refcount;
-      VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
-              << " is already loaded as module " << module;
+    if (!LoadModuleFromCuBin(cubin, &module)) {
+      return false;
     }
     kernel_to_gpu_binary_[kernel] = cubin;
-    gpu_binary_to_module_[cubin] = {module, module_refcount};
   } else if (spec.has_cuda_ptx_in_memory()) {
     kernelname = &spec.cuda_ptx_in_memory().kernelname();
@@ -254,24 +281,10 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
     }
 
     mutex_lock lock{in_memory_modules_mu_};
-    uint64_t module_refcount;
-    std::tie(module, module_refcount) = gpu_binary_to_module_[ptx];
-
-    if (module == nullptr) {
-      if (!CUDADriver::LoadPtx(context_, ptx, &module)) {
-        LOG(ERROR) << "failed to load PTX for kernel " << *kernelname;
-        return false;
-      }
-      VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx)
-              << " as module " << module;
-      module_refcount = 1;
-    } else {
-      ++module_refcount;
-      VLOG(3) << "PTX " << static_cast<const void *>(ptx)
-              << " is already loaded as module " << module;
+    if (!LoadModuleFromPtx(ptx, &module)) {
+      return false;
     }
     kernel_to_gpu_binary_[kernel] = ptx;
-    gpu_binary_to_module_[ptx] = {module, module_refcount};
   } else {
     LOG(WARNING) << "no method of loading CUDA kernel provided";
     return false;
@@ -295,6 +308,23 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
   return true;
 }
 
+bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
+  auto module_it = gpu_binary_to_module_.find(gpu_binary);
+  if (gpu_binary_to_module_.end() == module_it) {
+    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
+    return false;
+  }
+  auto &module = module_it->second.first;
+  auto &refcount = module_it->second.second;
+  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
+  if (--refcount == 0) {
+    VLOG(3) << "Unloading CUDA module " << module;
+    CUDADriver::UnloadModule(context_, module);
+    gpu_binary_to_module_.erase(module_it);
+  }
+  return true;
+}
+
 void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
   VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
 
@@ -307,25 +337,52 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
   }
   VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
           << " has loaded GPU code " << gpu_binary_it->second;
-  auto module_it = gpu_binary_to_module_.find(gpu_binary_it->second);
-  if (gpu_binary_to_module_.end() == module_it) {
-    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
-            << " has no loaded CUDA module.";
-    return;  // This kernel never loaded any modules
-  }
-  auto &module = module_it->second.first;
-  auto &refcount = module_it->second.second;
-  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
-          << " has loaded GPU code " << gpu_binary_it->second
-          << " into CUDA module " << module << " with refcount " << refcount;
-  if (--refcount == 0) {
-    VLOG(3) << "Unloading CUDA module " << module;
-    CUDADriver::UnloadModule(context_, module);
-    gpu_binary_to_module_.erase(module_it);
-  }
+  UnloadGpuBinary(gpu_binary_it->second);
   kernel_to_gpu_binary_.erase(gpu_binary_it);
 }
 
+bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+                              ModuleHandle *module_handle) {
+  // In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
+  // ModuleHandle::id().
+  CUmodule cu_module;
+  if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
+    if (!LoadModuleFromCuBin(
+            reinterpret_cast<const char *>(spec.cuda_cubin_in_memory().data()),
+            &cu_module)) {
+      return false;
+    }
+    *module_handle = ModuleHandle(const_cast<void *>(
+        static_cast<const void *>(spec.cuda_cubin_in_memory().data())));
+    return true;
+  } else if (spec.has_cuda_ptx_in_memory()) {
+    if (cc_major_ == 0 && cc_minor_ == 0) {
+      return false;
+    }
+
+    if (!spec.cuda_ptx_in_memory()) {
+      return false;
+    }
+
+    mutex_lock lock{in_memory_modules_mu_};
+    if (!LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module)) {
+      return false;
+    }
+    *module_handle = ModuleHandle(const_cast<void *>(
+        static_cast<const void *>(spec.cuda_ptx_in_memory())));
+    return true;
+  }
+  LOG(WARNING) << "no method of loading CUDA module provided";
+  return false;
+}
+
+bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
+  const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
+  mutex_lock lock{in_memory_modules_mu_};
+  return UnloadGpuBinary(gpu_binary);
+}
+
 bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
                                      KernelMetadata *kernel_metadata) {
   int value;
@@ -783,16 +840,26 @@ bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
   return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
 }
 
-bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem,
+bool CUDAExecutor::GetSymbol(const string &symbol_name,
+                             ModuleHandle module_handle, void **mem,
                              size_t *bytes) {
+  auto lookup_in_module = [&](CUmodule module) {
+    CHECK(module != nullptr);
+    return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
+                                       reinterpret_cast<CUdeviceptr *>(mem),
+                                       bytes);
+  };
+
   {  // give limited scope to mutex_lock
     mutex_lock lock{in_memory_modules_mu_};
+    if (static_cast<bool>(module_handle)) {
+      auto it = gpu_binary_to_module_.find(module_handle.id());
+      CHECK(it != gpu_binary_to_module_.end());
+      return lookup_in_module(it->second.first);
+    }
+
     for (auto &it : gpu_binary_to_module_) {
-      CUmodule module = it.second.first;
-      CHECK(module != nullptr);
-      if (CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
-                                      reinterpret_cast<CUdeviceptr *>(mem),
-                                      bytes)) {
+      if (lookup_in_module(it.second.first)) {
         return true;
       }
     }
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index f7c341c857..8a954d5461 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -62,6 +62,9 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
   bool GetKernel(const MultiKernelLoaderSpec &spec,
                  KernelBase *kernel) override;
   void UnloadKernel(const KernelBase *kernel) override;
+  bool LoadModule(const MultiModuleLoaderSpec &spec,
+                  ModuleHandle *module_handle) override;
+  bool UnloadModule(ModuleHandle module_handle) override;
 
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &k,
@@ -175,7 +178,8 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   // Search for the symbol and returns a device pointer and size.
   // Returns false if symbol does not exist.
- bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override; + bool GetSymbol(const string &symbol_name, ModuleHandle module_handle, + void **mem, size_t *bytes) override; DeviceDescription *PopulateDeviceDescription() const override; @@ -239,6 +243,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface { void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims, const BlockDim &block_dims); + bool LoadModuleFromCuBin(const char *cubin, CUmodule *module) + EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_); + + // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated. + bool LoadModuleFromPtx(const char *ptx, CUmodule *module) + EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_); + + bool UnloadGpuBinary(const void *gpu_binary) + EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_); + // Guards the in-memory-module mapping. mutex in_memory_modules_mu_; diff --git a/tensorflow/stream_executor/module_spec.h b/tensorflow/stream_executor/module_spec.h new file mode 100644 index 0000000000..212ae7ba9c --- /dev/null +++ b/tensorflow/stream_executor/module_spec.h @@ -0,0 +1,65 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
+#define TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
+
+#include "tensorflow/stream_executor/lib/array_slice.h"
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+// Describes how to load a module on a target platform.
+//
+// The exact meaning of a "module" may differ from platform to platform but
+// loosely speaking a module a collection of kernels and global variables.  It
+// corresponds to CUmodule when running on CUDA.
+class MultiModuleLoaderSpec {
+ public:
+  bool has_cuda_cubin_in_memory() const { return has_cuda_cubin_in_memory_; }
+  port::ArraySlice<const uint8> cuda_cubin_in_memory() const {
+    CHECK(has_cuda_cubin_in_memory());
+    return {cuda_cubin_in_memory_.data(), cuda_cubin_in_memory_.size()};
+  }
+
+  bool has_cuda_ptx_in_memory() const { return has_cuda_ptx_in_memory_; }
+  const char* cuda_ptx_in_memory() const {
+    CHECK(has_cuda_ptx_in_memory());
+    return cuda_ptx_in_memory_;
+  }
+
+  void AddCudaCubinInMemory(port::ArraySlice<const uint8> cubin_bytes) {
+    has_cuda_cubin_in_memory_ = true;
+    cuda_cubin_in_memory_ = cubin_bytes;
+  }
+
+  void AddCudaPtxInMemory(const char* ptx) {
+    has_cuda_ptx_in_memory_ = true;
+    // The CUDA driver does not like getting an empty string as PTX.
+    cuda_ptx_in_memory_ = *ptx ? ptx : nullptr;
+  }
+
+ private:
+  port::ArraySlice<const uint8> cuda_cubin_in_memory_;
+  bool has_cuda_cubin_in_memory_ = false;
+  const char* cuda_ptx_in_memory_;
+  bool has_cuda_ptx_in_memory_ = false;
+};
+
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index fb1b92cb84..f34b1fc083 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -36,20 +36,38 @@ limitations under the License.
 #include "tensorflow/stream_executor/kernel_cache_config.h"
 #include "tensorflow/stream_executor/kernel_spec.h"
 #include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/module_spec.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/shared_memory_config.h"
 #include "tensorflow/stream_executor/trace_listener.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
 
 namespace stream_executor {
 
 class Stream;
 class Timer;
 
+// An opaque handle to a loaded module.
+//
+// An instance of this is returned from StreamExecutor::GetModule.
+class ModuleHandle {
+ public:
+  /*implicit*/ ModuleHandle(void *id = nullptr) : id_(id) {}
+
+  // A ModuleHandle with id() == nullptr is an invalid module handle, akin to a
+  // null pointer.
+ void *id() const { return id_; } + + explicit operator bool() const { return id() != nullptr; } + + private: + void *id_; +}; + namespace internal { // Platform-dependent interface class for the generic Events interface, in @@ -164,6 +182,11 @@ class StreamExecutorInterface { KernelBase *kernel) { return false; } + virtual bool LoadModule(const MultiModuleLoaderSpec &spec, + ModuleHandle *module_handle) { + return false; + } + virtual bool UnloadModule(ModuleHandle module_handle) { return false; } virtual bool Launch(Stream *stream, const ThreadDim &thread_dims, const BlockDim &block_dims, const KernelBase &k, const KernelArgsArrayBase &args) { @@ -247,7 +270,12 @@ class StreamExecutorInterface { // null, however, both of them cannot be null at the same time. To use // constant memory in CUDA, GetSymbol has to be used. Returns true if symbol // is found. - virtual bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) { + // + // If ModuleHandle is set then we search for `symbol_name` only within the + // module corresponding to `module_handle`. Otherwise all loaded modules are + // searched. 
+  virtual bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+                         void **mem, size_t *bytes) {
     return false;
   }
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 000795ff00..2e0137a485 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -220,6 +220,15 @@ void StreamExecutor::UnloadKernel(const KernelBase *kernel) {
   implementation_->UnloadKernel(kernel);
 }
 
+bool StreamExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+                                ModuleHandle *module_handle) {
+  return implementation_->LoadModule(spec, module_handle);
+}
+
+bool StreamExecutor::UnloadModule(ModuleHandle module_handle) {
+  return implementation_->UnloadModule(module_handle);
+}
+
 void StreamExecutor::Deallocate(DeviceMemoryBase *mem) {
   VLOG(1) << "Called StreamExecutor::Deallocate(mem=" << mem->opaque()
           << ") mem->size()=" << mem->size() << StackTraceIfVLOG10();
@@ -459,9 +468,34 @@ void *StreamExecutor::Allocate(uint64 size) {
   return buf;
 }
 
-bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem,
+port::StatusOr<DeviceMemoryBase> StreamExecutor::GetUntypedSymbol(
+    const string &symbol_name, ModuleHandle module_handle) {
+  // If failed to get the symbol, opaque/bytes are unchanged. Initialize them to
+  // be nullptr/0 for consistency with DeviceMemory semantics.
+  void *opaque = nullptr;
+  size_t bytes = 0;
+  if (GetSymbol(symbol_name, module_handle, &opaque, &bytes)) {
+    return DeviceMemoryBase(opaque, bytes);
+  }
+
+  if (static_cast<bool>(module_handle)) {
+    return port::Status(
+        port::error::NOT_FOUND,
+        port::StrCat("Check if module containing symbol ", symbol_name,
+                     " is loaded (module_handle = ",
+                     reinterpret_cast<uintptr_t>(module_handle.id()), ")"));
+  } else {
+    return port::Status(
+        port::error::NOT_FOUND,
+        port::StrCat("Check if kernel using the symbol is loaded: ",
+                     symbol_name));
+  }
+}
+
+bool StreamExecutor::GetSymbol(const string &symbol_name,
+                               ModuleHandle module_handle, void **mem,
                                size_t *bytes) {
-  return implementation_->GetSymbol(symbol_name, mem, bytes);
+  return implementation_->GetSymbol(symbol_name, module_handle, mem, bytes);
 }
 
 void *StreamExecutor::UnifiedMemoryAllocate(uint64 bytes) {
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index ad80a1ba25..47b3a2b030 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -106,6 +106,16 @@ class StreamExecutor {
   // Releases any state associated with the previously loaded kernel.
   void UnloadKernel(const KernelBase *kernel);
 
+  // Loads a module for the platform this StreamExecutor is acting upon.
+  //
+  // `spec` describes the module to be loaded.  On success writes the handle for
+  // the loaded module to `module_handle` and returns true.  Else returns false.
+  bool LoadModule(const MultiModuleLoaderSpec &spec,
+                  ModuleHandle *module_handle);
+
+  // Unloads the module with handle `module_handle`.
+  bool UnloadModule(ModuleHandle module_handle);
+
   // Synchronously allocates an array on the device of type T with element_count
   // elements.
   template <typename T>
@@ -169,8 +179,16 @@ class StreamExecutor {
   //   type of symbol and T match.
   // - Note: symbol_name should include its namespace as well. For example,
   //   pass "nms0::symbol" if referring to nms0::symbol.
+  //
+  // If `module_handle` is set then searches only within the module
+  // corresponding to `module_handle`.
   template <typename T>
-  port::StatusOr<DeviceMemory<T>> GetSymbol(const string &symbol_name);
+  port::StatusOr<DeviceMemory<T>> GetSymbol(const string &symbol_name,
+                                            ModuleHandle module_handle = {});
+
+  // An untyped version of GetSymbol.
+  port::StatusOr<DeviceMemoryBase> GetUntypedSymbol(
+      const string &symbol_name, ModuleHandle module_handle = {});
 
   // Deallocate the DeviceMemory previously allocated via this interface.
   // Deallocation of a nullptr-representative value is permitted.
@@ -507,7 +525,8 @@ class StreamExecutor {
 
   // Finds and retrieves device memory for the symbol on the underlying
   // platform.
-  bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes);
+  bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+                 void **mem, size_t *bytes);
 
   // Entrains a memcpy operation onto stream, with a host destination location
   // host_dst and a device memory source, with target size size.
@@ -678,6 +697,41 @@ class StreamExecutor {
   SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor);
 };
 
+// A wrapper around ModuleHandle that uses RAII to manage its lifetime.
+class ScopedModuleHandle {
+ public:
+  explicit ScopedModuleHandle(StreamExecutor *executor,
+                              ModuleHandle module_handle)
+      : executor_(executor), module_handle_(module_handle) {}
+
+  ScopedModuleHandle(ScopedModuleHandle &&other) {
+    executor_ = other.executor_;
+    module_handle_ = other.module_handle_;
+    other.executor_ = nullptr;
+    other.module_handle_ = ModuleHandle();
+  }
+
+  ScopedModuleHandle &operator=(ScopedModuleHandle &&other) {
+    executor_ = other.executor_;
+    module_handle_ = other.module_handle_;
+    other.executor_ = nullptr;
+    other.module_handle_ = ModuleHandle();
+    return *this;
+  }
+
+  ~ScopedModuleHandle() {
+    if (static_cast<bool>(module_handle_)) {
+      CHECK(executor_->UnloadModule(module_handle_));
+    }
+  }
+
+ private:
+  StreamExecutor *executor_;
+  ModuleHandle module_handle_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ScopedModuleHandle);
+};
+
 ////////////
 // Inlines
 
@@ -690,19 +744,13 @@ inline DeviceMemory<T> StreamExecutor::AllocateArray(uint64 element_count) {
 
 template <typename T>
 inline port::StatusOr<DeviceMemory<T>> StreamExecutor::GetSymbol(
-    const string &symbol_name) {
-  // If failed to get the symbol, opaque/bytes are unchanged. Initialize them to
-  // be nullptr/0 for consistency with DeviceMemory semantics.
-  void *opaque = nullptr;
-  size_t bytes = 0;
-  if (GetSymbol(symbol_name, &opaque, &bytes)) {
-    CHECK_EQ(bytes % sizeof(T), 0);
-    return DeviceMemory<T>::MakeFromByteSize(opaque, bytes);
+    const string &symbol_name, ModuleHandle module_handle) {
+  port::StatusOr<DeviceMemoryBase> untyped_symbol =
+      GetUntypedSymbol(symbol_name, module_handle);
+  if (!untyped_symbol.ok()) {
+    return untyped_symbol.status();
   }
-  return port::Status(
-      port::error::NOT_FOUND,
-      port::StrCat("Check if kernel using the symbol is loaded: ",
-                   symbol_name));
+  return DeviceMemory<T>(untyped_symbol.ValueOrDie());
 }
 
 template <typename T>
--
cgit v1.2.3