author | Manjunath Kudlur <keveman@gmail.com> | 2015-11-06 16:27:58 -0800
---|---|---
committer | Manjunath Kudlur <keveman@gmail.com> | 2015-11-06 16:27:58 -0800
commit | f41959ccb2d9d4c722fe8fc3351401d53bcf4900 (patch) |
tree | ef0ca22cb2a5ac4bdec9d080d8e0788a53ed496d | /tensorflow/stream_executor/gcuda.h
TensorFlow: Initial commit of TensorFlow library.
TensorFlow is an open source software library for numerical computation
using data flow graphs.
Base CL: 107276108
Diffstat (limited to 'tensorflow/stream_executor/gcuda.h')
-rw-r--r-- | tensorflow/stream_executor/gcuda.h | 415
1 file changed, 415 insertions, 0 deletions
diff --git a/tensorflow/stream_executor/gcuda.h b/tensorflow/stream_executor/gcuda.h
new file mode 100644
index 0000000000..24b09c5358
--- /dev/null
+++ b/tensorflow/stream_executor/gcuda.h
@@ -0,0 +1,415 @@
+// Common declarations and includes for mixed-mode GPU usage at Google.
+//
+// This header serves to define a "common baseline" for GPU usage, either with
+// gcudacc or nvcc, and on the host or device. The rule of thumb is: "if you're
+// working with mixed-mode GPU code at Google, include this header."
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GCUDA_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GCUDA_H_
+
+// Symbol glossary:
+//   __CUDACC__: CUDA-capable compiler, compiling host or device code
+//   __CUDA_ARCH__: compiling device code
+//   __GCUDACC__: using gcudacc
+//   __NVCC__: using nvcc
+
+// For device code compiled with gcudacc, CUDA_ASSUME(X) tells the compiler
+// that it may assume that X is true. This can enable further optimization.
+// It is undefined behavior if X is not true. X should not have side effects,
+// and gcudacc will try to warn you if it does.
+#if defined(__CUDA_ARCH__) && defined(__GCUDACC__)
+#define CUDA_ASSUME(X) __builtin_assume(X)
+#else
+#define CUDA_ASSUME(X) do {} while (false)
+#endif
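// Illustrative usage of CUDA_ASSUME (an editorial sketch, not part of the
// original file): inside a gcudacc device function it can communicate a
// caller-guaranteed invariant to the optimizer, for example
//
//   __device__ float BlockSum(const float* data, int n) {
//     CUDA_ASSUME(n % 32 == 0);  // undefined behavior if the caller lies
//     float total = 0.0f;
//     for (int i = 0; i < n; ++i) total += data[i];
//     return total;
//   }
//
// The function name and the invariant are hypothetical; under nvcc, or on the
// host, the macro expands to a no-op.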
+
+namespace perftools {
+namespace gputools {
+namespace cache_config {
+// A version of the KernelCacheConfig enum class, exposed for pre-C++11
+// compilers.
+enum CacheConfig {
+  // Indicates no preference for device L1/shared memory configuration.
+  kNoPreference,
+
+  // Indicates a preference for more shared memory than L1 cache.
+  kPreferShared,
+
+  // Indicates a preference for more L1 cache than shared memory.
+  kPreferL1,
+
+  // Indicates a preference for equal amounts of L1 cache and shared memory.
+  kPreferEqual,
+};
+}  // namespace cache_config
+
+namespace shared_mem_config {
+// A compatibility-layer declaration of CUsharedconfig, needed to support
+// cuFuncSetSharedMemConfig/cudaDeviceSetSharedMemConfig. Declared here for
+// compatibility with pre-C++11 compilers.
+enum SharedMemConfig {
+  // Indicates that the context's shared memory config should be used.
+  kDefaultBankSize,
+
+  // Specifies a four-byte bank size for shared memory.
+  kFourByteBankSize,
+
+  // Specifies an eight-byte bank size for shared memory.
+  kEightByteBankSize,
+};
+}  // namespace shared_mem_config
+}  // namespace gputools
+}  // namespace perftools
+
+#if !defined(__NVCC__) && !defined(GCUDACC_STANDALONE_MODE)
+// Using gcudacc, either device-only or mixed-mode code. No special
+// declarations are needed for host-only code being compiled under gcudacc.
+
+// These includes are required by the code introduced during gcudacc operation.
+// Since the user code may not directly include these headers, they may not be
+// present in the build environment without inclusion here.
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/kernel.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/machine_manager.h"
+#include "tensorflow/stream_executor/shared_memory_config.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+// cudaConfigureCall is a symbol used by Clang when it sees a CUDA
+// triple-angle-bracket launch, so we declare it here so the symbol resolves.
+// It is not used by gcudacc-generated code, however, so it is not defined
+// anywhere. In other words, this is a dummy declaration needed for parsing.
+
+#ifdef __GCUDACC__
+// These symbols only need to be defined during compilation with gcudacc.
+namespace perftools {
+namespace gputools {
+
+// This class defines all the implicit conversions necessary to match launch
+// dimensions against the cudaConfigureCall() signature, and sits where a dim3
+// usually would in triple-angle launches. It supports the following kernel
+// launch dimension styles:
+//   kernel<<<1, 1>>>()
+//   kernel<<<BlockDim(...), ThreadDim(...)>>>()
+//   kernel<<<dim3(1), dim3(1)>>>()
+// All of these are predicated upon implicit conversions, which are frowned
+// upon by the style guide. Rather than add this CUDA-specific bad behavior to
+// StreamExecutor headers, we isolate it here.
+class LaunchDimConverter {
+ public:
+  LaunchDimConverter(unsigned long long int i) : _dim(i, 1, 1) {}  // NOLINT
+  LaunchDimConverter(::perftools::gputools::BlockDim dim)
+      :  // NOLINT
+        _dim(dim.x, dim.y, dim.z) {}
+  LaunchDimConverter(::perftools::gputools::ThreadDim dim)
+      :  // NOLINT
+        _dim(dim.x, dim.y, dim.z) {}
+  LaunchDimConverter(dim3 dim) :  // NOLINT
+      _dim(dim.x, dim.y, dim.z) {}
+
+  ::perftools::gputools::BlockDim AsBlockDim() {
+    return ::perftools::gputools::BlockDim(_dim.x, _dim.y, _dim.z);
+  }
+
+  ::perftools::gputools::ThreadDim AsThreadDim() {
+    return ::perftools::gputools::ThreadDim(_dim.x, _dim.y, _dim.z);
+  }
+
+ private:
+  ::perftools::gputools::Dim3D _dim;
+};
+}  // namespace gputools
+}  // namespace perftools
+
+int cudaConfigureCall(::perftools::gputools::LaunchDimConverter grid_size,
+                      ::perftools::gputools::LaunchDimConverter block_size,
+                      unsigned shared_size = 0,
+                      ::perftools::gputools::Stream *stream = 0);
+#endif
+
+// The rest of the symbols in this block are needed during both StreamExecutor
+// and user library compilation.
+namespace perftools {
+namespace gputools {
+
+// Gets the preferred shared memory configuration for the device to which
+// the specified executor is bound.
+shared_mem_config::SharedMemConfig DeviceGetSharedMemConfig(
+    StreamExecutor *stream_exec);
+
+// Sets the preferred shared memory configuration for the device to which
+// the specified executor is bound.
+// Does not return an error if the current device is invalid.
+void DeviceSetSharedMemConfig(StreamExecutor *stream_exec,
+                              shared_mem_config::SharedMemConfig config);
+
+// Sets the preferred cache configuration for the given kernel.
+template <typename KernelT>
+void FuncSetCacheConfig(Stream *stream, KernelT kernel,
+                        cache_config::CacheConfig cache_config) {
+  FuncSetCacheConfig(stream, reinterpret_cast<void *>(kernel), cache_config);
+}
+
+// Internal specialization of the above.
+template <>
+void FuncSetCacheConfig<void *>(Stream *stream, void *kernel,
+                                cache_config::CacheConfig cache_config);
+
+// Gets the preferred cache configuration for the given kernel.
+template <typename KernelT>
+KernelCacheConfig FuncGetCacheConfig(KernelT kernel) {
+  return FuncGetCacheConfig(reinterpret_cast<void *>(kernel));
+}
+
+// Internal specialization of the above.
+template <>
+KernelCacheConfig FuncGetCacheConfig<void *>(void *kernel);
+
+}  // namespace gputools
+}  // namespace perftools
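// Illustrative usage (an editorial sketch, not part of the original file):
// with the declarations above, gcudacc mixed-mode code can set a kernel's
// cache preference and launch it with any of the supported dimension styles,
// for example
//
//   __global__ void Scale(float* data, float factor);  // hypothetical kernel
//
//   void LaunchScale(perftools::gputools::Stream* stream, float* data,
//                    float factor) {
//     perftools::gputools::FuncSetCacheConfig(
//         stream, Scale, perftools::gputools::cache_config::kPreferL1);
//     Scale<<<perftools::gputools::BlockDim(64),
//             perftools::gputools::ThreadDim(256), 0, stream>>>(data, factor);
//   }
//
// The kernel, the wrapper function, and the launch dimensions are
// hypothetical; the triple-angle launch resolves through LaunchDimConverter
// and the cudaConfigureCall() declaration above.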
+
+#elif defined(__NVCC__)
+// NVCC code compilation, device-only or mixed mode. As above, no special
+// declarations are needed for host-only code.
+namespace perftools {
+namespace gputools {
+class Stream;
+}  // namespace gputools
+}  // namespace perftools
+
+// --- BEGIN EXTERNALLY-DEFINED FUNCTIONS
+
+// The following functions must be defined in some external library linked
+// into the final binary; they are _not_ defined in StreamExecutor
+// (in nvcc mode).
+
+// Sets the preferred cache configuration for the specified kernel.
+template <typename KernelT>
+void SetCudaCacheConfig(perftools::gputools::Stream* stream, KernelT kernel,
+    ::perftools::gputools::cache_config::CacheConfig preference);
+
+// Gets the current device for use in CUDA runtime-emulating routines.
+// The return value is the device ordinal as returned by
+// StreamExecutor::device_ordinal().
+int GetDevice();
+
+// Sets the current device for use in CUDA runtime-emulating routines.
+// "device" is the device ordinal as returned by
+// StreamExecutor::device_ordinal().
+void SetDevice(int device);
+
+// --- END EXTERNALLY-DEFINED FUNCTIONS
+
+namespace perftools {
+namespace gputools {
+template <typename KernelT>
+void FuncSetCacheConfig(Stream *stream, KernelT kernel,
+                        cache_config::CacheConfig cache_config) {
+  SetCudaCacheConfig(stream, reinterpret_cast<void*>(kernel), cache_config);
+}
+}  // namespace gputools
+}  // namespace perftools
+
+// The following functions are declared extern "C" in CUDA's
+// device_functions.h, so we have to wrap them for compatibility with the
+// cuda_builtin namespace. The thin wrappers defined below are what the
+// cuda_builtin versions further down forward to.
+__forceinline__ __device__ clock_t __gcuda_nvcc_clock() { return clock(); }
+__forceinline__ __device__ int __gcuda_nvcc__clz(int x) {
+  return __clz(x);
+}
+__forceinline__ __device__ int __gcuda_nvcc__clzll(long long int x) {
+  return __clzll(x);
+}
+__forceinline__ __device__ float __gcuda_nvcc__fdividef(float a, float b) {
+  return __fdividef(a, b);
+}
+__forceinline__ __device__ int __gcuda_nvcc__ffsll(long long int x) {  // NOLINT
+  return __ffsll(x);
+}
+__forceinline__ __device__ int __gcuda_nvcc__popc(unsigned int x) {
+  return __popc(x);
+}
+__forceinline__ __device__ float __gcuda_nvcc__powf(float a, float b) {
+  return __powf(a, b);
+}
+__forceinline__ __device__ void __gcuda_nvcc__sincosf(
+    float x, float *sptr, float *cptr) {
+  __sincosf(x, sptr, cptr);
+}
+__forceinline__ __device__ unsigned int __gcuda_nvcc__umulhi(
+    unsigned int x, unsigned int y) {
+  return __umulhi(x, y);
+}
+
+#if __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
+__forceinline__ __device__ unsigned int __gcuda_nvcc__ballot(int x) {
+  return __ballot(x);
+}
+#endif  // __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
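// Illustrative usage (an editorial sketch, not part of the original file):
// device code written against the cuda_builtin namespace defined below picks
// up these thin wrappers transparently, for example
//
//   __global__ void PopCount(const unsigned int* in, int* out, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n) out[i] = cuda_builtin::__popc(in[i]);
//   }
//
// The kernel name and signature are hypothetical.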
+
+// Forward-declare printf, as nvcc does not declare it by itself and we need
+// this file to compile even if it is included before stdio.h or cstdio.
+int printf(const char* format, ...);
+
+namespace cuda_builtin {
+using ::abs;
+using ::atomicAdd;
+using ::atomicCAS;
+using ::ceil;
+using ::ceilf;
+using ::cos;
+using ::cosf;
+using ::erfcinv;
+using ::erfcinvf;
+using ::exp;
+using ::expf;
+using ::fabs;
+using ::fabsf;
+using ::floor;
+using ::floorf;
+using ::fma;
+using ::fmaf;
+using ::fmax;
+using ::fmaxf;
+using ::fmin;
+using ::fminf;
+using ::log;
+using ::log1p;
+using ::log1pf;
+using ::logf;
+using ::max;
+using ::min;
+using ::powf;
+using ::printf;
+using ::sin;
+using ::sinf;
+using ::sincos;
+using ::sincosf;
+using ::sincospi;
+using ::sincospif;
+using ::sqrt;
+using ::sqrtf;
+using ::tanh;
+using ::trunc;
+using ::truncf;
+
+// rsqrt and rsqrtf are functions defined by nvcc in both host and device
+// mode. They are added to gcuda.h so that they are likewise available on both
+// host and device: on the device side they correspond to intrinsics, while
+// explicit host-side definitions are provided below.
+#ifdef __CUDA_ARCH__
+using ::rsqrt;
+using ::rsqrtf;
+#else
+__forceinline__ __host__ __device__ float rsqrtf(float x) {
+  return 1 / std::sqrt(x);
+}
+__forceinline__ __host__ __device__ double rsqrt(double x) {
+  return 1 / std::sqrt(x);
+}
+#endif
+
+__forceinline__ __device__ int clock() { return __gcuda_nvcc_clock(); }
+
+__forceinline__ __device__ int __clz(int x) {
+  return __gcuda_nvcc__clz(x);
+}
+
+__forceinline__ __device__ int __clz(long long int x) {
+  return __gcuda_nvcc__clzll(x);
+}
+
+__forceinline__ __device__ float __fdividef(float a, float b) {
+  return __gcuda_nvcc__fdividef(a, b);
+}
+
+__forceinline__ __device__ int __ffsll(long long int x) {  // NOLINT
+  return __gcuda_nvcc__ffsll(x);
+}
+
+__forceinline__ __device__ int __popc(unsigned int x) {
+  return __gcuda_nvcc__popc(x);
+}
+
+__forceinline__ __device__ float __powf(float a, float b) {
+  return __gcuda_nvcc__powf(a, b);
+}
+
+__forceinline__ __device__ void __sincosf(float x, float *sptr, float *cptr) {
+  __gcuda_nvcc__sincosf(x, sptr, cptr);
+}
+
+__forceinline__ __device__ unsigned int __umulhi(unsigned int x,
+                                                 unsigned int y) {
+  return __gcuda_nvcc__umulhi(x, y);
+}
+
+#ifdef __CUDA_ARCH__
+// These symbols are only visible when parsing device code.
+using ::__double_as_longlong;
+using ::__int_as_float;
+using ::__float_as_int;
+using ::__longlong_as_double;
+#endif  // __CUDA_ARCH__
+
+#if __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
+__forceinline__ __device__ unsigned int __ballot(int x) {
+  return __gcuda_nvcc__ballot(x);
+}
+#endif  // __CUDA_ARCH__ >= 200 || !defined(__CUDA_ARCH__)
+
+#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
+using ::__shfl;
+using ::__shfl_down;
+using ::__shfl_up;
+using ::__shfl_xor;
+#endif  // __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)
+
+#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)
+using ::__ldg;
+#endif  // __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)
+
+#if __CUDA_API_VERSION < 6050
+// CUDA < 6.5 defines isfinite as a macro, while CUDA >= 6.5 and gcudacc
+// define isfinite as a function. Work around this for the CUDA 5.5 case,
+// duplicating that macro definition.
+#undef isfinite
+#define __gcuda_nvcc_isfinite(x)                        \
+  (sizeof(x) == sizeof(float) ? __finitef(x) :          \
+   sizeof(x) == sizeof(double) ? __finite(x) : __finitel(x))
+inline __device__ int isfinite(float x) {
+  return __gcuda_nvcc_isfinite(x);
+}
+inline __device__ int isfinite(double x) {
+  return __gcuda_nvcc_isfinite(x);
+}
+inline __device__ int isfinite(long double x) {
+  return __gcuda_nvcc_isfinite(x);
+}
+#else
+// CUDA API >= v6.5
+using ::isfinite;
+#endif  // __CUDA_API_VERSION >= 6050
+}  // namespace cuda_builtin
+
+#if __CUDA_API_VERSION >= 6050
+// The second part of the isfinite workaround.
+inline __device__ int isfinite(float x) {
+  return __gcuda_nvcc_isfinite(x);
+}
+inline __device__ int isfinite(double x) {
+  return __gcuda_nvcc_isfinite(x);
+}
+inline __device__ int isfinite(long double x) {
+  return __gcuda_nvcc_isfinite(x);
+}
+#endif  // __CUDA_API_VERSION >= 6050
+
+#endif  // defined(__NVCC__)
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GCUDA_H_
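As a quick host-side sanity check of the rsqrt/rsqrtf fallbacks above, the
following self-contained sketch compiles as ordinary C++ (the CUDA qualifiers
are dropped purely for illustration; the snippet is editorial and not how the
header itself is consumed):

    #include <cassert>
    #include <cmath>

    // Host-side fallbacks, copied from the #else branch of the rsqrt section
    // above, with the CUDA qualifiers removed so this builds with a plain
    // C++ compiler.
    namespace cuda_builtin {
    inline float rsqrtf(float x) { return 1 / std::sqrt(x); }
    inline double rsqrt(double x) { return 1 / std::sqrt(x); }
    }  // namespace cuda_builtin

    int main() {
      assert(cuda_builtin::rsqrtf(4.0f) == 0.5f);  // 1 / sqrt(4)  == 0.5
      assert(cuda_builtin::rsqrt(16.0) == 0.25);   // 1 / sqrt(16) == 0.25
      return 0;
    }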