diff options
author | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2017-08-31 02:49:39 +0000 |
---|---|---|
committer | Benoit Steiner <benoit.steiner.goog@gmail.com> | 2017-08-31 02:49:39 +0000 |
commit | a4089991eb6bdb9e8ebfef93d81ca7b5e67ea77d (patch) | |
tree | 49a9b6c0c4ec6d006debe862cf209a8f252cfe78 /Eigen/src/Core/arch/CUDA | |
parent | 304ef2957134be386e50592ad7120177c5f3a7c0 (diff) |
Added support for CUDA 9.0.
Diffstat (limited to 'Eigen/src/Core/arch/CUDA')
-rw-r--r-- | Eigen/src/Core/arch/CUDA/Half.h | 58 | ||||
-rw-r--r-- | Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 3 |
2 files changed, 37 insertions, 24 deletions
diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 8cedd65ad..1c557767a 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -50,38 +50,45 @@ struct half; namespace half_impl { #if !defined(EIGEN_HAS_CUDA_FP16) - -// Make our own __half definition that is similar to CUDA's. -struct __half { - EIGEN_DEVICE_FUNC __half() : x(0) {} - explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {} +// Make our own __half_raw definition that is similar to CUDA's. +struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; - +#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw +typedef __half __half_raw; #endif -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); -struct half_base : public __half { +struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} +#endif }; } // namespace half_impl // Class definition. struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) - typedef half_impl::__half __half; + #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) + typedef half_impl::__half_raw __half_raw; #endif EIGEN_DEVICE_FUNC half() {} - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} +#endif explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} @@ -269,8 +276,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { // these in hardware. If we need more performance on older/other CPUs, they are // also possible to vectorize directly. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { - __half h; +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { + __half_raw h; h.x = x; return h; } @@ -280,12 +287,13 @@ union FP32 { float f; }; -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 - return __float2half(ff); + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; #elif defined(EIGEN_HAS_FP16_C) - __half h; + __half_raw h; h.x = _cvtss_sh(ff, 0); return h; @@ -296,7 +304,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { const FP32 f16max = { (127 + 16) << 23 }; const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; - __half o; + __half_raw o; o.x = static_cast<unsigned short>(0x0u); unsigned int sign = f.u & sign_mask; @@ -335,7 +343,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #endif } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { #if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(h); @@ -612,11 +620,15 @@ struct hash<Eigen::half> { // Add the missing shfl_xor intrinsic #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + #if EIGEN_CUDACC_VER < 90000 return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); + #else + return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width)); + #endif } #endif -// ldg() has an overload for __half, but we also need one for Eigen::half. +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. #if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index ba6a7f920..ce48e4b31 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -100,7 +100,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) { half2 result; - result.x = a.x & 0x7FFF7FFF; + unsigned temp = *(reinterpret_cast<const unsigned*>(&(a))); + *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF; return result; } |