Diffstat (limited to 'tensorflow/stream_executor/cuda/cuda_kernel.h')
-rw-r--r-- tensorflow/stream_executor/cuda/cuda_kernel.h | 115
1 file changed, 115 insertions(+), 0 deletions(-)
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h
new file mode 100644
index 0000000000..e8ad3955e9
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.h
@@ -0,0 +1,115 @@
+// The CUDA implementation of the StreamExecutorInterface functionality.
+// CUDA inclusions are ideally confined to this implementation file.
+//
+// The notions from the StreamExecutor basically correspond to the CUDA streams
+// programming model provided by the libcuda.so driver APIs, so we don't have
+// to do much more than wrap the calls to the libraries appropriately.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
+
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/lib/casts.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "third_party/gpus/cuda/include/cuda.h"
+
+#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
+#error \
+ "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
+#endif
+
+#ifdef __CUDA_RUNTIME_H__
+#error \
+ "CUDA runtime being included into CUDA GPU executor; should be driver only."
+#endif
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+// Wraps a CUfunction to implement the platform-independent KernelInterface.
+class CUDAKernel : public internal::KernelInterface {
+ public:
+ CUDAKernel() : cuda_function_(nullptr), arity_(0),
+ preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
+
+ // Note that the function is unloaded when the module is unloaded, and the
+ // module that the function is contained in is owned by the CUDAExecutor.
+ ~CUDAKernel() override {}
+
+  // Since arity cannot be reflected upon via the CUDA API, it is set
+  // explicitly during the CUDAExecutor::GetKernel initialization process.
+ void set_arity(unsigned arity) { arity_ = arity; }
+ unsigned Arity() const override { return arity_; }
+
+ // Returns the CUfunction value for passing to the CUDA API.
+ CUfunction AsCUDAFunctionValue() const {
+ DCHECK(cuda_function_ != nullptr);
+ return const_cast<CUfunction>(cuda_function_);
+ }
+
+ // Returns the slot that the CUfunction is stored within for this object,
+ // for the CUDA API which wants to load into a CUfunction*.
+ CUfunction *cuda_function_ptr() { return &cuda_function_; }
+
+  // CUDA supports setting the preferred cache configuration of a CUfunction
+  // (more-or-less equivalent to a CUDAKernel). Users can set a preference
+  // here, and it is applied when the kernel is [lazily] loaded (in
+  // CUDAExecutor::Launch). Loading the kernel and setting the preference
+  // eagerly in the setter below would be equally valid.
+
+  // Sets the current kernel cache configuration preference.
+ void SetPreferredCacheConfig(KernelCacheConfig config) override {
+ preferred_cache_config_ = config;
+ }
+
+ // Returns the current kernel cache configuration preference.
+ KernelCacheConfig GetPreferredCacheConfig() const override {
+ return preferred_cache_config_;
+ }
+
+ // Returns the current kernel cache configuration preference as a
+ // CUfunc_cache.
+ CUfunc_cache GetCUDACacheConfig() const {
+ switch (preferred_cache_config_) {
+ case KernelCacheConfig::kNoPreference:
+ return CU_FUNC_CACHE_PREFER_NONE;
+ case KernelCacheConfig::kPreferShared:
+ return CU_FUNC_CACHE_PREFER_SHARED;
+ case KernelCacheConfig::kPreferL1:
+ return CU_FUNC_CACHE_PREFER_L1;
+ case KernelCacheConfig::kPreferEqual:
+ return CU_FUNC_CACHE_PREFER_EQUAL;
+ default:
+      LOG(FATAL) << "Unknown KernelCacheConfig: "
+                 << static_cast<int32>(preferred_cache_config_);
+ }
+ }
+
+ private:
+ CUfunction cuda_function_; // Wrapped CUDA kernel handle.
+ unsigned arity_; // Number of formal parameters the kernel takes.
+
+ // Preferred (but not required) cache configuration for this kernel.
+ KernelCacheConfig preferred_cache_config_;
+};
+
+// Given a platform-independent kernel datatype, returns the (const) internal
+// CUDA platform implementation pointer.
+inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
+ return static_cast<const CUDAKernel *>(kernel->implementation());
+}
+
+// Given a platform-independent kernel datatype, returns the (non-const)
+// internal CUDA platform implementation pointer.
+inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
+ return static_cast<CUDAKernel *>(kernel->implementation());
+}
+
+} // namespace cuda
+} // namespace gputools
+} // namespace perftools
+
+#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
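
For context, here is a minimal sketch of how a caller such as CUDAExecutor might wire up and launch a CUDAKernel. It is illustrative only: the helper name, module handle, kernel name, arity, cache preference, and launch geometry are all assumptions, and the raw driver calls (cuModuleGetFunction, cuFuncSetCacheConfig, cuLaunchKernel) stand in for the CUDADriver wrappers that real callers are expected to route through cuda_driver.cc, per the #error guard in the header.

#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
#include "third_party/gpus/cuda/include/cuda.h"

namespace perftools {
namespace gputools {
namespace cuda {

// Hypothetical helper, for illustration only: resolves `kernel_name` out of
// an already-loaded `module`, applies a cache preference, and performs a
// trivial 1x1x1 launch on `stream`.
bool LoadAndLaunch(CUmodule module, const char *kernel_name, CUstream stream,
                   void **kernel_params) {
  CUDAKernel kernel;

  // cuda_function_ptr() exposes the CUfunction slot precisely so that a
  // driver-level load can write into it.
  if (cuModuleGetFunction(kernel.cuda_function_ptr(), module, kernel_name) !=
      CUDA_SUCCESS) {
    return false;
  }

  // Arity cannot be queried from the driver API, so the caller supplies it
  // (one formal parameter is assumed here).
  kernel.set_arity(1);

  // Record a preference, then apply it at launch time via the CUfunc_cache
  // translation, mirroring the lazy application the header describes.
  kernel.SetPreferredCacheConfig(KernelCacheConfig::kPreferShared);
  cuFuncSetCacheConfig(kernel.AsCUDAFunctionValue(),
                       kernel.GetCUDACacheConfig());

  return cuLaunchKernel(kernel.AsCUDAFunctionValue(),
                        /*gridDimX=*/1, /*gridDimY=*/1, /*gridDimZ=*/1,
                        /*blockDimX=*/1, /*blockDimY=*/1, /*blockDimZ=*/1,
                        /*sharedMemBytes=*/0, stream, kernel_params,
                        /*extra=*/nullptr) == CUDA_SUCCESS;
}

}  // namespace cuda
}  // namespace gputools
}  // namespace perftools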
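
Likewise, a short sketch of the AsCUDAKernel helpers in use. The cast inside them is an unchecked static_cast, so this is only valid for a KernelBase that was actually created by the CUDA platform; the snippet assumes it lives inside namespace perftools::gputools::cuda.

// Illustrative only: assumes `kernel` was built for the CUDA platform, so
// its implementation() really is a CUDAKernel.
CUfunction ExtractCUDAFunction(const KernelBase &kernel) {
  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
  return cuda_kernel->AsCUDAFunctionValue();
}

Keeping the cast in a helper, rather than exposing a CUfunction accessor on KernelBase itself, keeps CUDA types out of the platform-independent interface.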