diff options
Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_kernel.h')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_kernel.h | 115 |
1 file changed, 115 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h new file mode 100644 index 0000000000..e8ad3955e9 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_kernel.h @@ -0,0 +1,115 @@ +// The CUDA implementation of the StreamExecutorInterface functionality. +// CUDA inclusions are ideally confined to this implementation file. +// +// The notions from the StreamExecutor basically correspond to the CUDA streams +// programming model provided by the libcuda.so driver APIs, so we don't have +// to do much more than wrap the calls to the libraries appropriately. +#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_ +#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_ + +#include "tensorflow/stream_executor/kernel_cache_config.h" +#include "tensorflow/stream_executor/stream_executor_internal.h" +#include "tensorflow/stream_executor/cuda/cuda_driver.h" +#include "tensorflow/stream_executor/lib/casts.h" +#include "tensorflow/stream_executor/platform/port.h" +#include "tensorflow/stream_executor/platform/logging.h" +#include "third_party/gpus/cuda/include/cuda.h" + +#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_ +#error \ + "No driver calls in this file, wrap driver functionality in cuda_driver.cc." +#endif + +#ifdef __CUDA_RUNTIME_H__ +#error \ + "CUDA runtime being included into CUDA GPU executor; should be driver only." +#endif + +namespace perftools { +namespace gputools { +namespace cuda { + +// Wraps a CUfunction to implement the platform-independent KernelInterface. +class CUDAKernel : public internal::KernelInterface { + public: + CUDAKernel() : cuda_function_(nullptr), arity_(0), + preferred_cache_config_(KernelCacheConfig::kNoPreference) {} + + // Note that the function is unloaded when the module is unloaded, and the + // module that the function is contained in is owned by the CUDAExecutor. 
+ ~CUDAKernel() override {} + + // As arity cannot be reflected upon using the CUDA API, the arity is + // explicitly set during the CUDAExecutor::GetKernel initialization process. + void set_arity(unsigned arity) { arity_ = arity; } + unsigned Arity() const override { return arity_; } + + // Returns the CUfunction value for passing to the CUDA API. + CUfunction AsCUDAFunctionValue() const { + DCHECK(cuda_function_ != nullptr); + return const_cast<CUfunction>(cuda_function_); + } + + // Returns the slot that the CUfunction is stored within for this object, + // for the CUDA API which wants to load into a CUfunction*. + CUfunction *cuda_function_ptr() { return &cuda_function_; } + + // CUDA supports setting the preferred cache configuration of a CUfunction + // (more-or-less equivalent to a CUDAKernel). We support this via the below + // functions; users can set a preference, and that is applied when the kernel + // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to + // load the kernel & set the preference when the user calls the setter below; + // either approach is valid. + // Sets the current kernel cache configuration preference. + void SetPreferredCacheConfig(KernelCacheConfig config) override { + preferred_cache_config_ = config; + } + + // Returns the current kernel cache configuration preference. + KernelCacheConfig GetPreferredCacheConfig() const override { + return preferred_cache_config_; + } + + // Returns the current kernel cache configuration preference as a + // CUfunc_cache. 
+ CUfunc_cache GetCUDACacheConfig() const { + switch (preferred_cache_config_) { + case KernelCacheConfig::kNoPreference: + return CU_FUNC_CACHE_PREFER_NONE; + case KernelCacheConfig::kPreferShared: + return CU_FUNC_CACHE_PREFER_SHARED; + case KernelCacheConfig::kPreferL1: + return CU_FUNC_CACHE_PREFER_L1; + case KernelCacheConfig::kPreferEqual: + return CU_FUNC_CACHE_PREFER_EQUAL; + default: + LOG(FATAL) << "Unknown KernelCacheConfig" + << static_cast<int32>(preferred_cache_config_); + } + } + + private: + CUfunction cuda_function_; // Wrapped CUDA kernel handle. + unsigned arity_; // Number of formal parameters the kernel takes. + + // Preferred (but not required) cache configuration for this kernel. + KernelCacheConfig preferred_cache_config_; +}; + +// Given a platform-independent kernel datatype, returns the (const) internal +// CUDA platform implementation pointer. +inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) { + return static_cast<const CUDAKernel *>(kernel->implementation()); +} + +// Given a platform-independent kernel datatype, returns the (non-const) +// internal CUDA platform implementation pointer. +inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) { + return static_cast<CUDAKernel *>(kernel->implementation()); +} + +} // namespace cuda +} // namespace gputools +} // namespace perftools + +#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_ |