aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor
diff options
context:
space:
mode:
authorGravatar Sanjoy Das <sanjoy@google.com>2018-07-23 16:17:12 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-07-23 16:20:36 -0700
commit632e48c27e09b53ab52523149e759f9bc1711e71 (patch)
tree6c080226ca18ed1937b8a2afe973702d0ffffaee /tensorflow/stream_executor
parent9225bbbe0aaaa14b69176576097bb67bae98e6c5 (diff)
Teach StreamExecutor to load modules and resolve symbols in them
This will be used in a future CL. PiperOrigin-RevId: 205742731
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.cc179
-rw-r--r--tensorflow/stream_executor/cuda/cuda_gpu_executor.h16
-rw-r--r--tensorflow/stream_executor/module_spec.h65
-rw-r--r--tensorflow/stream_executor/stream_executor_internal.h32
-rw-r--r--tensorflow/stream_executor/stream_executor_pimpl.cc38
-rw-r--r--tensorflow/stream_executor/stream_executor_pimpl.h76
6 files changed, 331 insertions, 75 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 259c813c57..73f05b94db 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -206,6 +206,48 @@ static string GetBinaryDir(bool strip_exe) {
return exe_path;
}
+// Loads the in-memory CUBIN image `cubin` as a CUDA module and stores it in
+// `*module`. Modules are tracked in gpu_binary_to_module_ keyed by the `cubin`
+// pointer value: if that pointer already has a loaded module, the existing
+// module is returned and its reference count is incremented instead of
+// loading it again.
+//
+// Returns false (after logging the driver status) if the load fails; in that
+// case no entry for `cubin` is left in gpu_binary_to_module_.
+bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
+  uint64_t module_refcount;
+  // Note: operator[] default-inserts an entry with a null module when `cubin`
+  // has not been seen before.
+  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
+
+  if (*module == nullptr) {
+    auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
+    if (!load_status.ok()) {
+      LOG(ERROR) << "failed to load CUBIN: " << load_status;
+      // Drop the placeholder entry created by operator[] above. GetSymbol
+      // CHECK-fails on null modules when it iterates over this map, so a
+      // failed load must not leave one behind.
+      gpu_binary_to_module_.erase(static_cast<const void *>(cubin));
+      return false;
+    }
+    module_refcount = 1;
+    VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
+            << " as module " << *module;
+  } else {
+    ++module_refcount;
+    VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
+            << " is already loaded as module " << *module;
+  }
+  gpu_binary_to_module_[cubin] = {*module, module_refcount};
+  return true;
+}
+
+// Loads the PTX text `ptx` as a CUDA module and stores it in `*module`.
+// Modules are tracked in gpu_binary_to_module_ keyed by the `ptx` pointer
+// value: if that pointer already has a loaded module, the existing module is
+// returned and its reference count is incremented instead of loading it again.
+//
+// Returns false if the driver fails to load the PTX; in that case no entry for
+// `ptx` is left in gpu_binary_to_module_.
+bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
+  uint64_t module_refcount;
+  // Note: operator[] default-inserts an entry with a null module when `ptx`
+  // has not been seen before.
+  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
+
+  if (*module == nullptr) {
+    if (!CUDADriver::LoadPtx(context_, ptx, module)) {
+      // Drop the placeholder entry created by operator[] above. GetSymbol
+      // CHECK-fails on null modules when it iterates over this map, so a
+      // failed load must not leave one behind.
+      gpu_binary_to_module_.erase(static_cast<const void *>(ptx));
+      return false;
+    }
+    VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
+            << *module;
+    module_refcount = 1;
+  } else {
+    ++module_refcount;
+    // Log the module itself (`*module`), not the address of the out-parameter.
+    VLOG(3) << "PTX " << static_cast<const void *>(ptx)
+            << " is already loaded as module " << *module;
+  }
+  gpu_binary_to_module_[ptx] = {*module, module_refcount};
+  return true;
+}
+
bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) {
CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
@@ -215,28 +257,13 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
if (spec.has_cuda_cubin_in_memory()) {
+ mutex_lock lock{in_memory_modules_mu_};
kernelname = &spec.cuda_cubin_in_memory().kernelname();
const char *cubin = spec.cuda_cubin_in_memory().bytes();
- mutex_lock lock{in_memory_modules_mu_};
- uint64_t module_refcount;
- std::tie(module, module_refcount) = gpu_binary_to_module_[cubin];
-
- if (module == nullptr) {
- auto load_status = CUDADriver::LoadCubin(context_, cubin, &module);
- if (!load_status.ok()) {
- LOG(ERROR) << "failed to load CUBIN: " << load_status;
- return false;
- }
- module_refcount = 1;
- VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
- << " as module " << module;
- } else {
- ++module_refcount;
- VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
- << " is already loaded as module " << module;
+ if (!LoadModuleFromCuBin(cubin, &module)) {
+ return false;
}
kernel_to_gpu_binary_[kernel] = cubin;
- gpu_binary_to_module_[cubin] = {module, module_refcount};
} else if (spec.has_cuda_ptx_in_memory()) {
kernelname = &spec.cuda_ptx_in_memory().kernelname();
@@ -254,24 +281,10 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
}
mutex_lock lock{in_memory_modules_mu_};
- uint64_t module_refcount;
- std::tie(module, module_refcount) = gpu_binary_to_module_[ptx];
-
- if (module == nullptr) {
- if (!CUDADriver::LoadPtx(context_, ptx, &module)) {
- LOG(ERROR) << "failed to load PTX for kernel " << *kernelname;
- return false;
- }
- VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx)
- << " as module " << module;
- module_refcount = 1;
- } else {
- ++module_refcount;
- VLOG(3) << "PTX " << static_cast<const void *>(ptx)
- << " is already loaded as module " << module;
+ if (!LoadModuleFromPtx(ptx, &module)) {
+ return false;
}
kernel_to_gpu_binary_[kernel] = ptx;
- gpu_binary_to_module_[ptx] = {module, module_refcount};
} else {
LOG(WARNING) << "no method of loading CUDA kernel provided";
return false;
@@ -295,6 +308,23 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
return true;
}
+// Releases one reference on the CUDA module registered for `gpu_binary`.
+// When the last reference goes away the module is unloaded from the driver
+// and its entry is removed from gpu_binary_to_module_.
+//
+// Returns false if no module is registered for `gpu_binary`, true otherwise.
+bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
+  auto entry = gpu_binary_to_module_.find(gpu_binary);
+  if (entry == gpu_binary_to_module_.end()) {
+    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
+    return false;
+  }
+
+  auto &loaded_module = entry->second.first;
+  auto &users = entry->second.second;
+  // The count is logged before this call's reference is dropped.
+  VLOG(3) << "Found CUDA module " << loaded_module << " with refcount "
+          << users;
+
+  --users;
+  if (users != 0) {
+    return true;
+  }
+
+  VLOG(3) << "Unloading CUDA module " << loaded_module;
+  CUDADriver::UnloadModule(context_, loaded_module);
+  gpu_binary_to_module_.erase(entry);
+  return true;
+}
+
void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
@@ -307,25 +337,52 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
}
VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
<< " has loaded GPU code " << gpu_binary_it->second;
- auto module_it = gpu_binary_to_module_.find(gpu_binary_it->second);
- if (gpu_binary_to_module_.end() == module_it) {
- VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
- << " has no loaded CUDA module.";
- return; // This kernel never loaded any modules
- }
- auto &module = module_it->second.first;
- auto &refcount = module_it->second.second;
- VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
- << " has loaded GPU code " << gpu_binary_it->second
- << " into CUDA module " << module << " with refcount " << refcount;
- if (--refcount == 0) {
- VLOG(3) << "Unloading CUDA module " << module;
- CUDADriver::UnloadModule(context_, module);
- gpu_binary_to_module_.erase(module_it);
- }
+ UnloadGpuBinary(gpu_binary_it->second);
kernel_to_gpu_binary_.erase(gpu_binary_it);
}
+// Loads the module described by `spec`, preferring an in-memory CUBIN over
+// in-memory PTX, and writes the resulting handle to `*module_handle`.
+//
+// In CUDAExecutor the pointer to the GPU binary (PTX or CUBIN) is what gets
+// stored as ModuleHandle::id().
+//
+// Returns false if `spec` carries neither kind of binary, if the PTX pointer
+// is null, if the compute capability is still 0.0 on the PTX path, or if the
+// underlying load fails.
+bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+                              ModuleHandle *module_handle) {
+  CUmodule loaded_module;
+
+  if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
+    const void *cubin_bytes = spec.cuda_cubin_in_memory().data();
+    if (!LoadModuleFromCuBin(static_cast<const char *>(cubin_bytes),
+                             &loaded_module)) {
+      return false;
+    }
+    *module_handle = ModuleHandle(const_cast<void *>(cubin_bytes));
+    return true;
+  }
+
+  if (!spec.has_cuda_ptx_in_memory()) {
+    LOG(WARNING) << "no method of loading CUDA module provided";
+    return false;
+  }
+
+  if (cc_major_ == 0 && cc_minor_ == 0) {
+    return false;
+  }
+
+  const char *ptx_text = spec.cuda_ptx_in_memory();
+  if (ptx_text == nullptr) {
+    return false;
+  }
+
+  mutex_lock lock{in_memory_modules_mu_};
+  if (!LoadModuleFromPtx(ptx_text, &loaded_module)) {
+    return false;
+  }
+  *module_handle =
+      ModuleHandle(const_cast<void *>(static_cast<const void *>(ptx_text)));
+  return true;
+}
+
+// Drops one reference on the module identified by `module_handle`, whose id()
+// is used as the GPU binary key. Returns what UnloadGpuBinary returns.
+bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
+  const void *binary_key = module_handle.id();
+  mutex_lock lock{in_memory_modules_mu_};
+  return UnloadGpuBinary(binary_key);
+}
+
bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
KernelMetadata *kernel_metadata) {
int value;
@@ -783,16 +840,26 @@ bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
}
-bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem,
+bool CUDAExecutor::GetSymbol(const string &symbol_name,
+ ModuleHandle module_handle, void **mem,
size_t *bytes) {
+ auto lookup_in_module = [&](CUmodule module) {
+ CHECK(module != nullptr);
+ return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
+ reinterpret_cast<CUdeviceptr *>(mem),
+ bytes);
+ };
+
{ // give limited scope to mutex_lock
mutex_lock lock{in_memory_modules_mu_};
+ if (static_cast<bool>(module_handle)) {
+ auto it = gpu_binary_to_module_.find(module_handle.id());
+ CHECK(it != gpu_binary_to_module_.end());
+ return lookup_in_module(it->second.first);
+ }
+
for (auto &it : gpu_binary_to_module_) {
- CUmodule module = it.second.first;
- CHECK(module != nullptr);
- if (CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
- reinterpret_cast<CUdeviceptr *>(mem),
- bytes)) {
+ if (lookup_in_module(it.second.first)) {
return true;
}
}
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index f7c341c857..8a954d5461 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -62,6 +62,9 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
bool GetKernel(const MultiKernelLoaderSpec &spec,
KernelBase *kernel) override;
void UnloadKernel(const KernelBase *kernel) override;
+ bool LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle) override;
+ bool UnloadModule(ModuleHandle module_handle) override;
bool Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &k,
@@ -175,7 +178,8 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
// Search for the symbol and returns a device pointer and size.
// Returns false if symbol does not exist.
- bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override;
+ bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+ void **mem, size_t *bytes) override;
DeviceDescription *PopulateDeviceDescription() const override;
@@ -239,6 +243,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
const BlockDim &block_dims);
+ bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
+ EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+ // Loads the PTX text `ptx` as a CUDA module. `ptx` must be null terminated.
+ bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
+ EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+ bool UnloadGpuBinary(const void *gpu_binary)
+ EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
// Guards the in-memory-module mapping.
mutex in_memory_modules_mu_;
diff --git a/tensorflow/stream_executor/module_spec.h b/tensorflow/stream_executor/module_spec.h
new file mode 100644
index 0000000000..212ae7ba9c
--- /dev/null
+++ b/tensorflow/stream_executor/module_spec.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
+#define TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
+
+#include "tensorflow/stream_executor/lib/array_slice.h"
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+// Describes how to load a module on a target platform.
+//
+// The exact meaning of a "module" may differ from platform to platform but
+// loosely speaking a module is a collection of kernels and global variables.
+// It corresponds to CUmodule when running on CUDA.
+//
+// The spec stores the slice / pointer it is handed as-is; see the Add*
+// methods below.
+class MultiModuleLoaderSpec {
+ public:
+  // True once AddCudaCubinInMemory has been called.
+  bool has_cuda_cubin_in_memory() const { return has_cuda_cubin_in_memory_; }
+
+  // Returns the CUBIN bytes. CHECK-fails unless has_cuda_cubin_in_memory().
+  port::ArraySlice<const uint8> cuda_cubin_in_memory() const {
+    CHECK(has_cuda_cubin_in_memory());
+    return {cuda_cubin_in_memory_.data(), cuda_cubin_in_memory_.size()};
+  }
+
+  // True once AddCudaPtxInMemory has been called.
+  bool has_cuda_ptx_in_memory() const { return has_cuda_ptx_in_memory_; }
+
+  // Returns the PTX text, or nullptr if the PTX that was added was null or
+  // empty. CHECK-fails unless has_cuda_ptx_in_memory().
+  const char* cuda_ptx_in_memory() const {
+    CHECK(has_cuda_ptx_in_memory());
+    return cuda_ptx_in_memory_;
+  }
+
+  // Records `cubin_bytes` as the CUBIN image for this module.
+  void AddCudaCubinInMemory(port::ArraySlice<const uint8> cubin_bytes) {
+    has_cuda_cubin_in_memory_ = true;
+    cuda_cubin_in_memory_ = cubin_bytes;
+  }
+
+  // Records `ptx` as the PTX text for this module. A null or empty `ptx` is
+  // recorded as nullptr.
+  void AddCudaPtxInMemory(const char* ptx) {
+    has_cuda_ptx_in_memory_ = true;
+    // The CUDA driver does not like getting an empty string as PTX. Guard the
+    // dereference so that a null `ptx` is treated the same way as an empty
+    // one instead of being undefined behavior.
+    cuda_ptx_in_memory_ = (ptx != nullptr && *ptx) ? ptx : nullptr;
+  }
+
+ private:
+  port::ArraySlice<const uint8> cuda_cubin_in_memory_;
+  bool has_cuda_cubin_in_memory_ = false;
+  // Initialized so the member never holds an indeterminate value.
+  const char* cuda_ptx_in_memory_ = nullptr;
+  bool has_cuda_ptx_in_memory_ = false;
+};
+
+} // namespace stream_executor
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index fb1b92cb84..f34b1fc083 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -36,20 +36,38 @@ limitations under the License.
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/kernel_spec.h"
#include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/module_spec.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/shared_memory_config.h"
#include "tensorflow/stream_executor/trace_listener.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
namespace stream_executor {
class Stream;
class Timer;
+// An opaque handle to a loaded module.
+//
+// StreamExecutor::LoadModule writes an instance of this on success.
+class ModuleHandle {
+ public:
+  // Deliberately implicit; a default-constructed handle has a null id.
+  /*implicit*/ ModuleHandle(void *id = nullptr) : id_(id) {}
+
+  // True for every handle whose id is non-null.
+  explicit operator bool() const { return id_ != nullptr; }
+
+  // The raw id. A ModuleHandle whose id is nullptr is an invalid module
+  // handle, akin to a null pointer.
+  void *id() const { return id_; }
+
+ private:
+  void *id_;
+};
+
namespace internal {
// Platform-dependent interface class for the generic Events interface, in
@@ -164,6 +182,11 @@ class StreamExecutorInterface {
KernelBase *kernel) {
return false;
}
+ virtual bool LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle) {
+ return false;
+ }
+ virtual bool UnloadModule(ModuleHandle module_handle) { return false; }
virtual bool Launch(Stream *stream, const ThreadDim &thread_dims,
const BlockDim &block_dims, const KernelBase &k,
const KernelArgsArrayBase &args) {
@@ -247,7 +270,12 @@ class StreamExecutorInterface {
// null, however, both of them cannot be null at the same time. To use
// constant memory in CUDA, GetSymbol has to be used. Returns true if symbol
// is found.
- virtual bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) {
+ //
+ // If ModuleHandle is set then we search for `symbol_name` only within the
+ // module corresponding to `module_handle`. Otherwise all loaded modules are
+ // searched.
+ virtual bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+ void **mem, size_t *bytes) {
return false;
}
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 000795ff00..2e0137a485 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -220,6 +220,15 @@ void StreamExecutor::UnloadKernel(const KernelBase *kernel) {
implementation_->UnloadKernel(kernel);
}
+// Forwards the request to the platform-specific implementation and returns
+// its result unchanged.
+bool StreamExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle) {
+ return implementation_->LoadModule(spec, module_handle);
+}
+
+// Forwards the request to the platform-specific implementation and returns
+// its result unchanged.
+bool StreamExecutor::UnloadModule(ModuleHandle module_handle) {
+ return implementation_->UnloadModule(module_handle);
+}
+
void StreamExecutor::Deallocate(DeviceMemoryBase *mem) {
VLOG(1) << "Called StreamExecutor::Deallocate(mem=" << mem->opaque()
<< ") mem->size()=" << mem->size() << StackTraceIfVLOG10();
@@ -459,9 +468,34 @@ void *StreamExecutor::Allocate(uint64 size) {
return buf;
}
-bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem,
+// Looks up `symbol_name` and returns the device memory it refers to, without
+// attaching an element type. If `module_handle` is set the NOT_FOUND message
+// names that module; otherwise it refers to the kernel using the symbol.
+port::StatusOr<DeviceMemoryBase> StreamExecutor::GetUntypedSymbol(
+    const string &symbol_name, ModuleHandle module_handle) {
+  // A failed lookup leaves these untouched, so start from nullptr/0 for
+  // consistency with DeviceMemory semantics.
+  void *symbol_address = nullptr;
+  size_t symbol_size = 0;
+  const bool found =
+      GetSymbol(symbol_name, module_handle, &symbol_address, &symbol_size);
+  if (found) {
+    return DeviceMemoryBase(symbol_address, symbol_size);
+  }
+
+  if (!static_cast<bool>(module_handle)) {
+    return port::Status(
+        port::error::NOT_FOUND,
+        port::StrCat("Check if kernel using the symbol is loaded: ",
+                     symbol_name));
+  }
+
+  return port::Status(
+      port::error::NOT_FOUND,
+      port::StrCat("Check if module containing symbol ", symbol_name,
+                   " is loaded (module_handle = ",
+                   reinterpret_cast<uintptr_t>(module_handle.id()), ")"));
+}
+
+bool StreamExecutor::GetSymbol(const string &symbol_name,
+ ModuleHandle module_handle, void **mem,
size_t *bytes) {
- return implementation_->GetSymbol(symbol_name, mem, bytes);
+ return implementation_->GetSymbol(symbol_name, module_handle, mem, bytes);
}
void *StreamExecutor::UnifiedMemoryAllocate(uint64 bytes) {
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index ad80a1ba25..47b3a2b030 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -106,6 +106,16 @@ class StreamExecutor {
// Releases any state associated with the previously loaded kernel.
void UnloadKernel(const KernelBase *kernel);
+ // Loads a module for the platform this StreamExecutor is acting upon.
+ //
+ // `spec` describes the module to be loaded. On success writes the handle for
+ // the loaded module to `module_handle` and returns true. Else returns false.
+ bool LoadModule(const MultiModuleLoaderSpec &spec,
+ ModuleHandle *module_handle);
+
+ // Unloads the module with handle `module_handle`.
+ bool UnloadModule(ModuleHandle module_handle);
+
// Synchronously allocates an array on the device of type T with element_count
// elements.
template <typename T>
@@ -169,8 +179,16 @@ class StreamExecutor {
// type of symbol and T match.
// - Note: symbol_name should include its namespace as well. For example,
// pass "nms0::symbol" if referring to nms0::symbol.
+ //
+ // If `module_handle` is set then searches only within the module
+ // corresponding to `module_handle`.
template <typename T>
- port::StatusOr<DeviceMemory<T>> GetSymbol(const string &symbol_name);
+ port::StatusOr<DeviceMemory<T>> GetSymbol(const string &symbol_name,
+ ModuleHandle module_handle = {});
+
+ // An untyped version of GetSymbol.
+ port::StatusOr<DeviceMemoryBase> GetUntypedSymbol(
+ const string &symbol_name, ModuleHandle module_handle = {});
// Deallocate the DeviceMemory previously allocated via this interface.
// Deallocation of a nullptr-representative value is permitted.
@@ -507,7 +525,8 @@ class StreamExecutor {
// Finds and retrieves device memory for the symbol on the underlying
// platform.
- bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes);
+ bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+ void **mem, size_t *bytes);
// Entrains a memcpy operation onto stream, with a host destination location
// host_dst and a device memory source, with target size size.
@@ -678,6 +697,41 @@ class StreamExecutor {
SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor);
};
+// A wrapper around ModuleHandle that uses RAII to manage its lifetime.
+//
+// While it holds a valid handle, destroying the wrapper or move-assigning
+// over it unloads the module through the StreamExecutor it was given, and
+// CHECK-fails if that unload reports failure.
+class ScopedModuleHandle {
+ public:
+  explicit ScopedModuleHandle(StreamExecutor *executor,
+                              ModuleHandle module_handle)
+      : executor_(executor), module_handle_(module_handle) {}
+
+  // Takes over the module held by `other`, leaving `other` empty.
+  ScopedModuleHandle(ScopedModuleHandle &&other)
+      : executor_(other.executor_), module_handle_(other.module_handle_) {
+    other.executor_ = nullptr;
+    other.module_handle_ = ModuleHandle();
+  }
+
+  // Unloads the module currently held (if any) and then takes over the one
+  // held by `other`, leaving `other` empty. Self-assignment is a no-op.
+  ScopedModuleHandle &operator=(ScopedModuleHandle &&other) {
+    if (this != &other) {
+      // Without this the previously held module would never be unloaded.
+      UnloadIfLoaded();
+      executor_ = other.executor_;
+      module_handle_ = other.module_handle_;
+      other.executor_ = nullptr;
+      other.module_handle_ = ModuleHandle();
+    }
+    return *this;
+  }
+
+  ~ScopedModuleHandle() { UnloadIfLoaded(); }
+
+ private:
+  // Unloads the held module, if there is one. Does not reset the members;
+  // callers either overwrite them right away or are destroying the object.
+  void UnloadIfLoaded() {
+    if (static_cast<bool>(module_handle_)) {
+      CHECK(executor_->UnloadModule(module_handle_));
+    }
+  }
+
+  StreamExecutor *executor_;
+  ModuleHandle module_handle_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ScopedModuleHandle);
+};
+
////////////
// Inlines
@@ -690,19 +744,13 @@ inline DeviceMemory<T> StreamExecutor::AllocateArray(uint64 element_count) {
template <typename T>
inline port::StatusOr<DeviceMemory<T>> StreamExecutor::GetSymbol(
- const string &symbol_name) {
- // If failed to get the symbol, opaque/bytes are unchanged. Initialize them to
- // be nullptr/0 for consistency with DeviceMemory semantics.
- void *opaque = nullptr;
- size_t bytes = 0;
- if (GetSymbol(symbol_name, &opaque, &bytes)) {
- CHECK_EQ(bytes % sizeof(T), 0);
- return DeviceMemory<T>::MakeFromByteSize(opaque, bytes);
+ const string &symbol_name, ModuleHandle module_handle) {
+ port::StatusOr<DeviceMemoryBase> untyped_symbol =
+ GetUntypedSymbol(symbol_name, module_handle);
+ if (!untyped_symbol.ok()) {
+ return untyped_symbol.status();
}
- return port::Status(
- port::error::NOT_FOUND,
- port::StrCat("Check if kernel using the symbol is loaded: ",
- symbol_name));
+ return DeviceMemory<T>(untyped_symbol.ValueOrDie());
}
template <typename ElemT>