blob: e8ad3955e9a8f30b3443448a9c0572b75c07fba6 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
// The CUDA implementation of the StreamExecutorInterface functionality.
// CUDA inclusions are ideally confined to this implementation file.
//
// The notions from the StreamExecutor basically correspond to the CUDA streams
// programming model provided by the libcuda.so driver APIs, so we don't have
// to do much more than wrap the calls to the libraries appropriately.
#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include "tensorflow/stream_executor/cuda/cuda_driver.h"
#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "third_party/gpus/cuda/include/cuda.h"
#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
#error \
"No driver calls in this file, wrap driver functionality in cuda_driver.cc."
#endif
#ifdef __CUDA_RUNTIME_H__
#error \
"CUDA runtime being included into CUDA GPU executor; should be driver only."
#endif
namespace perftools {
namespace gputools {
namespace cuda {
// Wraps a CUfunction to implement the platform-independent KernelInterface.
class CUDAKernel : public internal::KernelInterface {
 public:
  CUDAKernel()
      : cuda_function_(nullptr),
        arity_(0),
        preferred_cache_config_(KernelCacheConfig::kNoPreference) {}

  // Note that the function is unloaded when the module is unloaded, and the
  // module that the function is contained in is owned by the CUDAExecutor.
  ~CUDAKernel() override {}

  // As arity cannot be reflected upon using the CUDA API, the arity is
  // explicitly set during the CUDAExecutor::GetKernel initialization process.
  void set_arity(unsigned arity) { arity_ = arity; }
  unsigned Arity() const override { return arity_; }

  // Returns the CUfunction value for passing to the CUDA API.
  CUfunction AsCUDAFunctionValue() const {
    DCHECK(cuda_function_ != nullptr);
    // CUfunction is a pointer typedef returned by value; the top-level const
    // from this const member function is dropped on return, so no const_cast
    // is needed.
    return cuda_function_;
  }

  // Returns the slot that the CUfunction is stored within for this object,
  // for the CUDA API which wants to load into a CUfunction*.
  CUfunction *cuda_function_ptr() { return &cuda_function_; }

  // CUDA supports setting the preferred cache configuration of a CUfunction
  // (more-or-less equivalent to a CUDAKernel). We support this via the below
  // functions; users can set a preference, and that is applied when the kernel
  // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to
  // load the kernel & set the preference when the user calls the setter below;
  // either approach is valid.

  // Sets the current kernel cache configuration preference.
  void SetPreferredCacheConfig(KernelCacheConfig config) override {
    preferred_cache_config_ = config;
  }

  // Returns the current kernel cache configuration preference.
  KernelCacheConfig GetPreferredCacheConfig() const override {
    return preferred_cache_config_;
  }

  // Returns the current kernel cache configuration preference as a
  // CUfunc_cache.
  CUfunc_cache GetCUDACacheConfig() const {
    switch (preferred_cache_config_) {
      case KernelCacheConfig::kNoPreference:
        return CU_FUNC_CACHE_PREFER_NONE;
      case KernelCacheConfig::kPreferShared:
        return CU_FUNC_CACHE_PREFER_SHARED;
      case KernelCacheConfig::kPreferL1:
        return CU_FUNC_CACHE_PREFER_L1;
      case KernelCacheConfig::kPreferEqual:
        return CU_FUNC_CACHE_PREFER_EQUAL;
      default:
        // LOG(FATAL) aborts, so falling off the end of the function is
        // unreachable. Separator added so the enum value doesn't run into
        // the message text.
        LOG(FATAL) << "Unknown KernelCacheConfig: "
                   << static_cast<int32>(preferred_cache_config_);
    }
  }

 private:
  CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
  unsigned arity_;  // Number of formal parameters the kernel takes.

  // Preferred (but not required) cache configuration for this kernel.
  KernelCacheConfig preferred_cache_config_;
};
// Given a platform-independent kernel datatype, returns the (const) internal
// CUDA platform implementation pointer.
inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
  const internal::KernelInterface *impl = kernel->implementation();
  return static_cast<const CUDAKernel *>(impl);
}
// Given a platform-independent kernel datatype, returns the (non-const)
// internal CUDA platform implementation pointer.
inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
  internal::KernelInterface *impl = kernel->implementation();
  return static_cast<CUDAKernel *>(impl);
}
} // namespace cuda
} // namespace gputools
} // namespace perftools
#endif // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
|