From 9ee9ac81de8e94cc4fb49ee7dd37e255d9ccbd76 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 3 Dec 2020 15:00:18 -0800 Subject: Fix shfl* macros for CUDA/HIP The `shfl*` functions are `__device__` only, and adjusted `#ifdef`s so they are defined whenever the corresponding CUDA/HIP ones are. Also changed the HIP/CUDA<9.0 versions to cast to int instead of doing the conversion `half`<->`float`. Fixes #2083 --- Eigen/src/Core/arch/Default/Half.h | 119 ++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 53 deletions(-) (limited to 'Eigen/src/Core/arch/Default') diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 7029c500d..b9a8a46aa 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -59,7 +59,7 @@ #define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \ template <> \ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED \ - PACKET_F16 METHOD(const PACKET_F16& _x) { \ + PACKET_F16 METHOD(const PACKET_F16& _x) { \ return float2half(METHOD(half2float(_x))); \ } @@ -772,58 +772,6 @@ struct hash { } // end namespace std -// Add the missing shfl* intrinsics. -// HIP and CUDA prior to 9.0 define -// __shfl, __shfl_up, __shfl_down, __shfl_xor for int, float -// CUDA since 9.0 deprecates those and instead defines -// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync -// for int, long, long long, float, double, __half, __half2, __nv_bfloat16, __nv_bfloat162 - -#if defined(EIGEN_HAS_HIP_FP16) || (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER < 90000) - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) { - return static_cast(__shfl(static_cast(var), srcLane, width)); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) { - return static_cast(__shfl_up(static_cast(var), delta, width)); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) { - return static_cast(__shfl_down(static_cast(var), delta, width)); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { - return static_cast(__shfl_xor(static_cast(var), laneMask, width)); -} - -#elif defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000 - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) { - return static_cast(__shfl_sync(mask, static_cast<__half>(var), srcLane, width)); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { - return static_cast(__shfl_up_sync(mask, static_cast<__half>(var), delta, width)); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { - return static_cast(__shfl_down_sync(mask, static_cast<__half>(var), delta, width)); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) { - return static_cast(__shfl_xor_sync(mask, static_cast<__half>(var), laneMask, width)); -} - -#endif // shfl - -// ldg() has an overload for __half_raw, but we also need one for Eigen::half. -#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || defined(EIGEN_HAS_HIP_FP16) -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { - return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast(ptr))); -} -#endif - namespace Eigen { namespace numext { @@ -859,4 +807,69 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast(c } // namespace numext } // namespace Eigen +// Add the missing shfl* intrinsics. +// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300. +// CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)) +// +// HIP and CUDA prior to SDK 9.0 define +// __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float +// CUDA since 9.0 deprecates those and instead defines +// __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync, +// with native support for __half and __nv_bfloat16 +// +// Note that the following are __device__ - only functions. +#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \ + || defined(EIGEN_HIPCC) + +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000 + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) { + return static_cast(__shfl_sync(mask, static_cast<__half>(var), srcLane, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { + return static_cast(__shfl_up_sync(mask, static_cast<__half>(var), delta, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) { + return static_cast(__shfl_down_sync(mask, static_cast<__half>(var), delta, width)); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) { + return static_cast(__shfl_xor_sync(mask, static_cast<__half>(var), laneMask, width)); +} + +#else // HIP or CUDA SDK < 9.0 + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl(ivar, srcLane, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_up(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_down(ivar, delta, width))); +} + +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + const int ivar = static_cast(Eigen::numext::bit_cast(var)); + return Eigen::numext::bit_cast(static_cast(__shfl_xor(ivar, laneMask, width))); +} + +#endif // HIP vs CUDA +#endif // __shfl* + +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. +#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \ + || defined(EIGEN_HIPCC) +EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) { + return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast(ptr))); +} +#endif // __ldg + #endif // EIGEN_HALF_H -- cgit v1.2.3