From 8cfe0db108f54e4ceae2e94c47c5d2eb5116197b Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Mon, 7 Dec 2020 19:11:07 -0800 Subject: Fix host/device calls for __half. The previous code had `__host__ __device__` functions calling `__device__` functions (e.g. `__low2half`) which caused build failures in tensorflow. Also tried to simplify the `#ifdef` guards to make them more clear. --- Eigen/src/Core/arch/GPU/PacketMath.h | 302 +++++++++++++++++++---------------- 1 file changed, 167 insertions(+), 135 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index fb32c98ac..83bd551a0 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -14,10 +14,21 @@ namespace Eigen { namespace internal { +// Read-only data cached load available. +#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350 +#define EIGEN_GPU_HAS_LDG 1 +#endif + +// FP16 math available. +#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1 +#endif + // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) #if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) + template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; @@ -237,7 +248,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt(const double2& a, const double2& b) { return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y)); } -#endif // EIGEN_CUDA_ARCH || defined(EIGEN_HIP_DEVICE_COMPILE) +#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { return make_float4(a, a+1, a+2, a+3); @@ -342,7 +353,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +#if defined(EIGEN_GPU_HAS_LDG) return __ldg((const float4*)from); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -350,7 +361,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const fl } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +#if defined(EIGEN_GPU_HAS_LDG) return __ldg((const double2*)from); #else return make_double2(from[0], from[1]); @@ -359,7 +370,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +#if defined(EIGEN_GPU_HAS_LDG) return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); #else return make_float4(from[0], from[1], from[2], from[3]); @@ -367,7 +378,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +#if defined(EIGEN_GPU_HAS_LDG) return make_double2(__ldg(from+0), __ldg(from+1)); #else return make_double2(from[0], from[1]); @@ -511,12 +522,43 @@ template<> struct packet_traits : default_packet_traits }; }; +namespace { +// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __halves2half2(a, b); +#else + // Round-about way since __halves2half2 is a __device__ function. + return __floats2half2_rn(__half2float(a), __half2float(b)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __low2half(a); +#else + return __float2half(__low2float(a)); +#endif +} + +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { +#if defined(EIGEN_GPU_COMPILE_PHASE) + return __high2half(a); +#else + return __float2half(__high2float(a)); +#endif +} +} // namespace + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if defined(EIGEN_HIPCC) +#if defined(EIGEN_HIP_DEVICE_COMPILE) return half2half2(from); -#else +#elif defined(EIGEN_CUDA_ARCH) return __half2half2(from); +#else + const float f = __half2float(from); + return __floats2half2_rn(f, f); #endif } @@ -532,7 +574,8 @@ pset1(const Eigen::half& from) { return r; } -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) +// We now need this visible on both host and device. +// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -540,11 +583,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return __halves2half2(from[0], from[1]); + return combine_half(from[0], from[1]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return __halves2half2(from[0], from[0]); + return combine_half(from[0], from[0]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, @@ -554,170 +597,164 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - to[0] = __low2half(from); - to[1] = __high2half(from); -#else - // Unfortunately __low2half and __high2half are only __device__ functions. - to[0] = __float2half(__low2float(from)); - to[1] = __float2half(__high2float(from)); -#endif + to[0] = get_half2_low(from); + to[1] = get_half2_high(from); } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( const Eigen::half* from) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350 +#if defined(EIGEN_GPU_HAS_LDG) return __ldg((const half2*)from); #else - return __halves2half2(*(from+0), *(from+1)); + return combine_half(*(from+0), *(from+1)); #endif } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( const Eigen::half* from) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350 +#if defined(EIGEN_GPU_HAS_LDG) return __halves2half2(__ldg(from+0), __ldg(from+1)); #else - return __halves2half2(*(from+0), *(from+1)); + return combine_half(*(from+0), *(from+1)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return __halves2half2(from[0*stride], from[1*stride]); + return combine_half(from[0*stride], from[1*stride]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = __low2half(from); - to[stride*1] = __high2half(from); + to[stride*0] = get_half2_low(from); + to[stride*1] = get_half2_high(from); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return __low2half(a); + return get_half2_low(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = __low2half(a); - half a2 = __high2half(a); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return __halves2half2(result1, result2); + return combine_half(result1, result2); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { half true_half = half_impl::raw_uint16_to_half(0xffffu); return pset1(true_half); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { half false_half = half_impl::raw_uint16_to_half(0x0000u); return pset1(false_half); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - __half a1 = __low2half(kernel.packet[0]); - __half a2 = __high2half(kernel.packet[0]); - __half b1 = __low2half(kernel.packet[1]); - __half b2 = __high2half(kernel.packet[1]); - kernel.packet[0] = __halves2half2(a1, b1); - kernel.packet[1] = __halves2half2(a2, b2); + __half a1 = get_half2_low(kernel.packet[0]); + __half a2 = get_half2_high(kernel.packet[0]); + __half b1 = get_half2_low(kernel.packet[1]); + __half b2 = get_half2_high(kernel.packet[1]); + kernel.packet[0] = combine_half(a1, b1); + kernel.packet[1] = combine_half(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; - return __halves2half2(a, __float2half(f)); + return combine_half(a, __float2half(f)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) { - half mask_low = __low2half(mask); - half mask_high = __high2half(mask); - half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); - half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); - return __halves2half2(result_low, result_high); + half mask_low = get_half2_low(mask); + half mask_high = get_half2_high(mask); + half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); + half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); + return combine_half(result_low, result_high); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; - return __halves2half2(eq1, eq2); + return combine_half(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; - return __halves2half2(eq1, eq2); + return combine_half(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return __halves2half2(result1, result2); + return combine_half(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return __halves2half2(result1, result2); + return combine_half(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return __halves2half2(result1, result2); + return combine_half(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); + half a1 = get_half2_low(a); + half a2 = get_half2_high(a); + half b1 = get_half2_low(b); + half b2 = get_half2_high(b); half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return __halves2half2(result1, result2); + return combine_half(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hadd2(a, b); #else float a1 = __low2float(a); @@ -732,7 +769,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hsub2(a, b); #else float a1 = __low2float(a); @@ -746,7 +783,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hneg2(a); #else float a1 = __low2float(a); @@ -759,7 +796,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hmul2(a, b); #else float a1 = __low2float(a); @@ -775,7 +812,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hfma2(a, b, c); #else float a1 = __low2float(a); @@ -792,9 +829,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __h2div(a, b); -#else // EIGEN_CUDA_ARCH +#else float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -811,9 +848,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); + __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, @@ -822,13 +859,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); + __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hadd(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); @@ -838,31 +875,31 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) __half first = __low2half(a); __half second = __high2half(a); return __hgt(first, second) ? first : second; #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 > a2 ? __low2half(a) : __high2half(a); + return a1 > a2 ? get_half2_low(a) : get_half2_high(a); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) __half first = __low2half(a); __half second = __high2half(a); return __hlt(first, second) ? first : second; #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 < a2 ? __low2half(a) : __high2half(a); + return a1 < a2 ? get_half2_low(a) : get_half2_high(a); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hmul(__low2half(a), __high2half(a)); #else float a1 = __low2float(a); @@ -996,7 +1033,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu( template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro(const Eigen::half* from) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350 +#if defined(EIGEN_GPU_HAS_LDG) Packet4h2 r; r = __ldg((const Packet4h2*)from); return r; @@ -1028,10 +1065,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather(const Eigen::half* from, Index stride) { Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]); - p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]); - p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]); - p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]); + p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); + p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); + p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); + p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); return r; } @@ -1066,13 +1103,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs( template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue( - const Packet4h2& a) { + const Packet4h2& /*a*/) { half true_half = half_impl::raw_uint16_to_half(0xffffu); return pset1(true_half); } template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero(const Packet4h2& a) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero(const Packet4h2& /*a*/) { half false_half = half_impl::raw_uint16_to_half(0x0000u); return pset1(false_half); } @@ -1112,12 +1149,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) { - __half a1 = __low2half(f0); - __half a2 = __high2half(f0); - __half b1 = __low2half(f1); - __half b2 = __high2half(f1); - f0 = __halves2half2(a1, b1); - f1 = __halves2half2(a2, b2); + __half a1 = get_half2_low(f0); + __half a2 = get_half2_high(f0); + __half b1 = get_half2_low(f1); + __half b2 = get_half2_high(f1); + f0 = combine_half(a1, b1); + f1 = combine_half(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void @@ -1191,9 +1228,7 @@ plset(const Eigen::half& a) { p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f))); return r; -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 +#elif EIGEN_CUDA_ARCH >= 530 Packet4h2 r; half2* r_alias = reinterpret_cast(&r); @@ -1216,14 +1251,12 @@ plset(const Eigen::half& a) { float f = __half2float(a); Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = __halves2half2(a, __float2half(f + 1.0f)); - p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f)); - p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f)); - p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f)); + p_alias[0] = combine_half(a, __float2half(f + 1.0f)); + p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); + p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); + p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); return r; #endif - -#endif } template <> @@ -1441,9 +1474,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = __halves2half2(predux_max(a_alias[0]), + half2 m0 = combine_half(predux_max(a_alias[0]), predux_max(a_alias[1])); - half2 m1 = __halves2half2(predux_max(a_alias[2]), + half2 m1 = combine_half(predux_max(a_alias[2]), predux_max(a_alias[3])); __half first = predux_max(m0); __half second = predux_max(m1); @@ -1460,9 +1493,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = __halves2half2(predux_min(a_alias[0]), + half2 m0 = combine_half(predux_min(a_alias[0]), predux_min(a_alias[1])); - half2 m1 = __halves2half2(predux_min(a_alias[2]), + half2 m1 = combine_half(predux_min(a_alias[2]), predux_min(a_alias[3])); __half first = predux_min(m0); __half second = predux_min(m1); @@ -1564,7 +1597,7 @@ prsqrt(const Packet4h2& a) { template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hadd2(a, b); #else float a1 = __low2float(a); @@ -1580,7 +1613,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530 +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __hmul2(a, b); #else float a1 = __low2float(a); @@ -1596,12 +1629,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - +#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC) return __h2div(a, b); - -#else // EIGEN_CUDA_ARCH - +#else float a1 = __low2float(a); float a2 = __high2float(a); float b1 = __low2float(b); @@ -1609,7 +1639,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, float r1 = a1 / b1; float r2 = a2 / b2; return __floats2half2_rn(r1, r2); - #endif } @@ -1620,9 +1649,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); + __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); } template<> @@ -1632,14 +1661,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); + __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); + __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); + return combine_half(r1, r2); } -#endif // defined(EIGEN_CUDA_ARCH) +// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) + +#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) -#endif // defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) +#undef EIGEN_GPU_HAS_LDG +#undef EIGEN_GPU_HAS_FP16_ARITHMETIC } // end namespace internal -- cgit v1.2.3