// Common declarations and includes for mixed-mode GPU usage at Google.
//
// This header serves to define a "common baseline" for GPU usage,
// either with gcudacc or nvcc, and on the host or device. The rule of thumb is,
// "if you're working with mixed-mode GPU code at Google, include this header."
#ifndef TENSORFLOW_STREAM_EXECUTOR_GCUDA_H_
#define TENSORFLOW_STREAM_EXECUTOR_GCUDA_H_

// Symbol glossary:
//   __CUDACC__: CUDA capable compiler, compiling host or device
//   __CUDA_ARCH__: Compiling device code
//   __GCUDACC__: Using gcudacc
//   __NVCC__: Using nvcc

// For device code compiled with gcudacc, CUDA_ASSUME(X) tells the compiler
// that it may assume that X is true. This can enable further optimization.
// It is undefined behavior if X is not true. X should not have side effects,
// and gcudacc will try to warn you if it does.
#if defined(__CUDA_ARCH__) && defined(__GCUDACC__)
#define CUDA_ASSUME(X) __builtin_assume(X)
#else
#define CUDA_ASSUME(X) do {} while (false)
#endif
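
// For illustration, a hypothetical kernel (name and shape invented for this
// example) that uses CUDA_ASSUME; behavior is undefined if the assumption is
// ever false at runtime:
//
//   __global__ void ScaleByTwo(float *data, int n) {
//     CUDA_ASSUME(n % 4 == 0);  // Lets gcudacc optimize the loop trip count.
//     for (int i = threadIdx.x; i < n; i += blockDim.x) {
//       data[i] *= 2.0f;
//     }
//   }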

namespace perftools {
namespace gputools {
namespace cache_config {
// A version of the KernelCacheConfig enum class, exposed for pre-C++11
// compilers.
enum CacheConfig {
  // Indicates no preference for device L1/shared memory configuration.
  kNoPreference,

  // Indicates a preference for more shared memory than L1 cache.
  kPreferShared,

  // Indicates a preference for more L1 cache than shared memory.
  kPreferL1,

  // Indicates a preference for equal amounts of L1 cache and shared memory.
  kPreferEqual,
};
}  // namespace cache_config

namespace shared_mem_config {
// A compatibility-layer declaration of CUsharedconfig, needed to support
// cuFuncSetSharedMemConfig/cudaDeviceSetSharedMemConfig. Declared here for
// compatibility with pre-C++11 compilers.
enum SharedMemConfig {
  // Indicates that the context's shared memory config should be used.
  kDefaultBankSize,

  // Specifies a four-byte bank size for shared memory.
  kFourByteBankSize,

  // Specifies an eight-byte bank size for shared memory.
  kEightByteBankSize,
};
}  // namespace shared_mem_config
}  // namespace gputools
}  // namespace perftools

#if !defined(__NVCC__) && !defined(GCUDACC_STANDALONE_MODE)
// Using gcudacc, either device-only or mixed-mode code. No special declarations
// are needed for host-only code being compiled under gcudacc.

// These includes are required by the code introduced during gcudacc operation.
// Since the user code may not directly include these headers, they may not be
// present in the build environment without inclusion here.
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/kernel.h"
#include "tensorflow/stream_executor/kernel_cache_config.h"
#include "tensorflow/stream_executor/launch_dim.h"
#include "tensorflow/stream_executor/machine_manager.h"
#include "tensorflow/stream_executor/shared_memory_config.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"

// cudaConfigureCall is a symbol Clang emits a reference to when it sees a CUDA
// triple-angle-bracket launch, so we declare it here so that the symbol
// resolves. It is not used by gcudacc-generated code, however, so it is not
// defined anywhere; it is a dummy declaration needed only for parsing.

#ifdef __GCUDACC__
// These symbols only need to be defined during compilation with gcudacc.
namespace perftools {
namespace gputools {

// This class defines all the implicit conversions necessary to match launch
// dimensions against the cudaConfigureCall() signature, and sits where a dim3
// usually would in triple angle launches. This supports the kernel launch
// dimension styles:
//  kernel<<<1, 1>>>() and
//  kernel<<<BlockDim(...), ThreadDim(...)>>> and
//  kernel<<<dim3(1), dim3(1)>>>
// All of these are predicated upon implicit conversions, which are frowned
// upon by the style guide. Rather than add this CUDA-specific bad behavior to
// StreamExecutor headers, we isolate it here.
class LaunchDimConverter {
 public:
  LaunchDimConverter(unsigned long long int i)  // NOLINT
      : _dim(i, 1, 1) {}
  LaunchDimConverter(::perftools::gputools::BlockDim dim)  // NOLINT
      : _dim(dim.x, dim.y, dim.z) {}
  LaunchDimConverter(::perftools::gputools::ThreadDim dim)  // NOLINT
      : _dim(dim.x, dim.y, dim.z) {}
  LaunchDimConverter(dim3 dim)  // NOLINT
      : _dim(dim.x, dim.y, dim.z) {}

  ::perftools::gputools::BlockDim AsBlockDim() {
    return ::perftools::gputools::BlockDim(_dim.x, _dim.y, _dim.z);
  }

  ::perftools::gputools::ThreadDim AsThreadDim() {
    return ::perftools::gputools::ThreadDim(_dim.x, _dim.y, _dim.z);
  }

 private:
  ::perftools::gputools::Dim3D _dim;
};
}  // namespace gputools
}  // namespace perftools

int cudaConfigureCall(::perftools::gputools::LaunchDimConverter grid_size,
                      ::perftools::gputools::LaunchDimConverter block_size,
                      unsigned shared_size = 0,
                      ::perftools::gputools::Stream *stream = 0);
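
// For illustration (hypothetical kernel name): under this scheme a launch like
//   my_kernel<<<BlockDim(16, 16), ThreadDim(256)>>>(args...);
// is parsed as if it began with
//   cudaConfigureCall(LaunchDimConverter(BlockDim(16, 16)),
//                     LaunchDimConverter(ThreadDim(256)));
// with shared_size and stream taking their default arguments.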
#endif

// The rest of the symbols in this block are needed during both StreamExecutor
// and user library compilation.
namespace perftools {
namespace gputools {

// Gets the preferred shared memory configuration for the device to which
// the specified executor is bound.
shared_mem_config::SharedMemConfig DeviceGetSharedMemConfig(
    StreamExecutor *stream_exec);

// Sets the preferred shared memory configuration for the device to which
// the specified executor is bound.
// Does not return an error if the current device is invalid.
void DeviceSetSharedMemConfig(StreamExecutor *stream_exec,
                              shared_mem_config::SharedMemConfig config);
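
// For illustration, requesting wider shared-memory banks on a device (assumes
// a valid StreamExecutor pointer from the surrounding code):
//   DeviceSetSharedMemConfig(stream_exec,
//                            shared_mem_config::kEightByteBankSize);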

// Sets the preferred cache configuration for the given kernel.
template <typename KernelT>
void FuncSetCacheConfig(Stream *stream, KernelT kernel,
                        cache_config::CacheConfig cache_config) {
  FuncSetCacheConfig(stream, reinterpret_cast<void *>(kernel), cache_config);
}

// Internal specialization of the above.
template <>
void FuncSetCacheConfig<void *>(Stream *stream, void *kernel,
                                cache_config::CacheConfig cache_config);

// Gets the preferred cache configuration for the given kernel.
template <typename KernelT>
KernelCacheConfig FuncGetCacheConfig(KernelT kernel) {
  return FuncGetCacheConfig(reinterpret_cast<void *>(kernel));
}

// Internal specialization of the above.
template <>
KernelCacheConfig FuncGetCacheConfig<void *>(void *kernel);
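
// For illustration (assumes a previously defined __global__ kernel symbol
// my_kernel and a valid Stream pointer from the surrounding code):
//   FuncSetCacheConfig(stream, my_kernel, cache_config::kPreferShared);
//   KernelCacheConfig config = FuncGetCacheConfig(my_kernel);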

}  // namespace gputools
}  // namespace perftools

#elif defined(__NVCC__)
// NVCC code compilation, device-only or mixed mode. As above, no special
// declarations are needed for host-only code.

// Standard headers backing the declarations below.
#include <cmath>   // std::sqrt, used by the host-side rsqrt/rsqrtf below
#include <time.h>  // clock_t, used by the clock() wrappers below

namespace perftools {
namespace gputools {
class Stream;
}  // namespace gputools
}  // namespace perftools

// --- BEGIN EXTERNALLY-DEFINED FUNCTIONS

// The following functions must be defined in some external library linked into
// the final binary; they are _not_ defined in the StreamExecutor library
// (in nvcc mode).

// Sets the preferred cache configuration for the specified kernel.
template <typename KernelT>
void SetCudaCacheConfig(perftools::gputools::Stream* stream, KernelT kernel,
    ::perftools::gputools::cache_config::CacheConfig preference);

// Gets the current device for use in CUDA runtime-emulating routines.
// Returns the device ordinal, as would be returned by
// StreamExecutor::device_ordinal().
int GetDevice();

// Sets the current device for use in CUDA runtime-emulating routines.
// "device" is the device ordinal as returned by
// StreamExecutor::device_ordinal().
void SetDevice(int device);
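
// For illustration, the save/switch/restore pattern these routines support
// (assumes a valid StreamExecutor pointer from the surrounding code):
//   int prior = GetDevice();
//   SetDevice(stream_exec->device_ordinal());
//   // ... issue CUDA-runtime-style work against the new device ...
//   SetDevice(prior);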

// --- END EXTERNALLY-DEFINED FUNCTIONS

namespace perftools {
namespace gputools {
template <typename KernelT>
void FuncSetCacheConfig(Stream *stream, KernelT kernel,
                        cache_config::CacheConfig cache_config) {
  SetCudaCacheConfig(stream, reinterpret_cast<void*>(kernel), cache_config);
}
}  // namespace gputools
}  // namespace perftools

// The following functions are declared extern "C" in CUDA's device_functions.h,
// so we have to wrap them for compatibility with the cuda_builtin namespace.
// Thin wrappers to break these functions out of cuda_builtin are defined below.
__forceinline__ __device__ clock_t __gcuda_nvcc_clock() { return clock(); }
__forceinline__ __device__ int __gcuda_nvcc__clz(int x) {
  return __clz(x);
}
__forceinline__ __device__ int __gcuda_nvcc__clzll(long long int x) {
  return __clzll(x);
}
__forceinline__ __device__ float __gcuda_nvcc__fdividef(float a, float b) {
  return __fdividef(a, b);
}
__forceinline__ __device__ int __gcuda_nvcc__ffsll(long long int x) { // NOLINT
  return __ffsll(x);
}
__forceinline__ __device__ int __gcuda_nvcc__popc(unsigned int x) {
  return __popc(x);
}
__forceinline__ __device__ float __gcuda_nvcc__powf(float a, float b) {
  return __powf(a, b);
}
__forceinline__ __device__ void __gcuda_nvcc__sincosf(
    float x, float *sptr, float *cptr) {
  __sincosf(x, sptr, cptr);
}
__forceinline__ __device__ unsigned int __gcuda_nvcc__umulhi(
    unsigned int x, unsigned int y) {
  return __umulhi(x, y);
}
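
// For illustration, device code written against this scheme reaches the
// wrapped intrinsics through the cuda_builtin namespace declared below, e.g.
// (hypothetical kernel):
//   __global__ void CountBits(const unsigned int *in, int *out) {
//     out[threadIdx.x] = cuda_builtin::__popc(in[threadIdx.x]);
//   }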

#if __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
__forceinline__ __device__ unsigned int __gcuda_nvcc__ballot(int x) {
  return __ballot(x);
}
#endif  // __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)

// Forward-declare printf, since nvcc does not declare it by itself and this
// file must compile even when it is included before <stdio.h> or <cstdio>.
int printf(const char* format, ...);

namespace cuda_builtin {
using ::abs;
using ::atomicAdd;
using ::atomicCAS;
using ::ceil;
using ::ceilf;
using ::cos;
using ::cosf;
using ::erfcinv;
using ::erfcinvf;
using ::exp;
using ::expf;
using ::fabs;
using ::fabsf;
using ::floor;
using ::floorf;
using ::fma;
using ::fmaf;
using ::fmax;
using ::fmaxf;
using ::fmin;
using ::fminf;
using ::log;
using ::log1p;
using ::log1pf;
using ::logf;
using ::max;
using ::min;
using ::powf;
using ::printf;
using ::sin;
using ::sinf;
using ::sincos;
using ::sincosf;
using ::sincospi;
using ::sincospif;
using ::sqrt;
using ::sqrtf;
using ::tanh;
using ::trunc;
using ::truncf;

// rsqrt and rsqrtf are defined by nvcc in both host and device mode. Expose
// them here so that they are usable in both modes as well: on the device side
// they correspond to intrinsics, while explicit host-side definitions are
// provided below.
#ifdef __CUDA_ARCH__
using ::rsqrt;
using ::rsqrtf;
#else
__forceinline__ __host__ __device__ float rsqrtf(float x) {
  return 1 / std::sqrt(x);
}
__forceinline__ __host__ __device__ double rsqrt(double x) {
  return 1 / std::sqrt(x);
}
#endif

__forceinline__ __device__ clock_t clock() { return __gcuda_nvcc_clock(); }

__forceinline__ __device__ int __clz(int x) {
  return __gcuda_nvcc__clz(x);
}

__forceinline__ __device__ int __clz(long long int x) {
  return __gcuda_nvcc__clzll(x);
}

__forceinline__ __device__ float __fdividef(float a, float b) {
  return __gcuda_nvcc__fdividef(a, b);
}

__forceinline__ __device__ int __ffsll(long long int x) { // NOLINT
  return __gcuda_nvcc__ffsll(x);
}

__forceinline__ __device__ int __popc(unsigned int x) {
  return __gcuda_nvcc__popc(x);
}

__forceinline__ __device__ float __powf(float a, float b) {
  return __gcuda_nvcc__powf(a, b);
}

__forceinline__ __device__ void __sincosf(float x, float *sptr, float *cptr) {
  __gcuda_nvcc__sincosf(x, sptr, cptr);
}

__forceinline__ __device__ unsigned int __umulhi(unsigned int x,
                                                 unsigned int y) {
  return __gcuda_nvcc__umulhi(x, y);
}

#ifdef __CUDA_ARCH__
// These symbols are only visible when parsing device code.
using ::__double_as_longlong;
using ::__int_as_float;
using ::__float_as_int;
using ::__longlong_as_double;
#endif  // __CUDA_ARCH__

#if __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
__forceinline__ __device__ unsigned int __ballot(int x) {
  return __gcuda_nvcc__ballot(x);
}
#endif  // __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)

#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
using ::__shfl;
using ::__shfl_down;
using ::__shfl_up;
using ::__shfl_xor;
#endif  // __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)

#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)
using ::__ldg;
#endif  // __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)

// Forward isfinite to the CUDA device intrinsics, dispatching on the width of
// the argument. This macro is used both by the pre-6.5 in-namespace functions
// below and by the global-scope workaround after the namespace.
#define __gcuda_nvcc_isfinite(x)                                  \
    (sizeof(x) == sizeof(float) ? __finitef(x) :                  \
        sizeof(x) == sizeof(double) ? __finite(x) : __finitel(x))

#if __CUDA_API_VERSION < 6050
// CUDA < 6.5 defines isfinite as a macro, while CUDA >= 6.5 and gcudacc
// define isfinite as a function. Work around this for the CUDA 5.5 case by
// replacing the macro with equivalent in-namespace functions.
#undef isfinite
inline __device__ int isfinite(float x) {
  return __gcuda_nvcc_isfinite(x);
}
inline __device__ int isfinite(double x) {
  return __gcuda_nvcc_isfinite(x);
}
inline __device__ int isfinite(long double x) {
  return __gcuda_nvcc_isfinite(x);
}
#else
// CUDA API >= v6.5
using ::isfinite;
#endif  // __CUDA_API_VERSION >= 6050
}  // namespace cuda_builtin

#if __CUDA_API_VERSION >= 6050
// The second part of the isfinite workaround.
inline __device__ int isfinite(float x) {
  return __gcuda_nvcc_isfinite(x);
}
inline __device__ int isfinite(double x) {
  return __gcuda_nvcc_isfinite(x);
}
inline __device__ int isfinite(long double x) {
  return __gcuda_nvcc_isfinite(x);
}
#endif  // __CUDA_API_VERSION >= 6050

#endif  // defined(__NVCC__)

#endif  // TENSORFLOW_STREAM_EXECUTOR_GCUDA_H_