diff options
author | 2016-03-03 10:34:20 -0800 | |
---|---|---|
committer | 2016-03-03 10:34:20 -0800 | |
commit | 1032441c6fea0a0d98b394abe8ffdb228256f47b (patch) | |
tree | 9d3801e7f392aebb6f7443a860b794778e9d0a7b /Eigen | |
parent | 1da10a73580b3f3b672397ad65cded9300535ac7 (diff) |
Enable partial support for half floats on Kepler GPUs.
Diffstat (limited to 'Eigen')
-rw-r--r-- | Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 142 |
1 files changed, 78 insertions, 64 deletions
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 1a1b4ec3d..720155ce1 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -17,8 +17,10 @@ // we'll use on the host side (SSE, AVX, ...) #if defined(__CUDACC__) && defined(EIGEN_USE_GPU) -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +// The following operations require arch >= 5.3 +#if __CUDA_ARCH__ >= 530 __device__ half operator + (const half& a, const half& b) { return __hadd(a, b); } @@ -60,6 +62,7 @@ __device__ half abs(const half& a) { return result; } } +#endif namespace Eigen { namespace internal { @@ -98,8 +101,79 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& return __half2half2(from); } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) { + return *reinterpret_cast<const half2*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) { + *reinterpret_cast<half2*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) { + to[0] = __low2half(from); + to[1] = __high2half(from); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) { +#if __CUDA_ARCH__ >= 320 + return __ldg((const half2*)from); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) { +#if __CUDA_ARCH__ >= 320 + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif +} + +template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) { + return __low2half(a); +} + +template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) { + half2 result; + result.x = a.x & 0x7FFF7FFF; + return result; +} + + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<half2,2>& kernel) { + half a1 = __low2half(kernel.packet[0]); + half a2 = __high2half(kernel.packet[0]); + half b1 = __low2half(kernel.packet[1]); + half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +// The following operations require arch >= 5.3 +#if __CUDA_ARCH__ >= 530 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) { - return __halves2half2(a, __hadd(a, __float2half(1))); + return __halves2half2(a, __hadd(a, __float2half(1.0f))); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) { @@ -140,7 +214,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& float b1 = __low2float(b); float b2 = __high2float(b); half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + half r2 = a2 < b2 ? __high2half(a) : __high2half(b); return __halves2half2(r1, r2); } @@ -154,50 +228,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& return __halves2half2(r1, r2); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) { - return *reinterpret_cast<const half2*>(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) { - return __halves2half2(from[0], from[1]); -} - -template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) { - return __halves2half2(from[0], from[0]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) { - *reinterpret_cast<half2*>(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) { - to[0] = __low2half(from); - to[1] = __high2half(from); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) { - return __ldg((const half2*)from); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) { - return __halves2half2(__ldg(from+0), __ldg(from+1)); -} - -template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) { - return __halves2half2(from[0*stride], from[1*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) { - to[stride*0] = __low2half(from); - to[stride*1] = __high2half(from); -} - -template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) { - return __low2half(a); -} - template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) { return __hadd(__low2half(a), __high2half(a)); } @@ -217,23 +247,7 @@ template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) { template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) { return __hmul(__low2half(a), __high2half(a)); } - -template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) { - half2 result; - result.x = a.x & 0x7FFF7FFF; - return result; -} - - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock<half2,2>& kernel) { - half a1 = __low2half(kernel.packet[0]); - half a2 = __high2half(kernel.packet[0]); - half b1 = __low2half(kernel.packet[1]); - half b2 = __high2half(kernel.packet[1]); - kernel.packet[0] = __halves2half2(a1, b1); - kernel.packet[1] = __halves2half2(a2, b2); -} +#endif } // end namespace internal |