diff options
author | Deven Desai <deven.desai.amd@gmail.com> | 2020-03-11 23:06:56 +0000 |
---|---|---|
committer | Deven Desai <deven.desai.amd@gmail.com> | 2020-03-12 01:06:24 +0000 |
commit | 7158ed4e0e34d40cd0f358a3bf69a5c30d8d0f83 (patch) | |
tree | 6ee1f2ce81b3e442210564b283fdf9e953ff0306 | |
parent | d53ae40f7bcfb948b85b893acf305cdebcba3ba8 (diff) |
Fixing HIP breakage caused by the recent commit that introduces Packet4h2 as the Eigen::Half packet type
-rw-r--r-- | Eigen/src/Core/arch/Default/Half.h | 4 | ||||
-rw-r--r-- | Eigen/src/Core/arch/GPU/PacketMath.h | 11 | ||||
-rw-r--r-- | Eigen/src/Core/util/ConfigureVectorization.h | 3 | ||||
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 46 |
4 files changed, 39 insertions, 25 deletions
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 9f4c1ebdf..cfd0bdc06 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -706,7 +706,7 @@ struct hash<Eigen::half> { // Add the missing shfl_xor intrinsic #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) + defined(EIGEN_HIPCC) __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { #if (EIGEN_CUDA_SDK_VER < 90000) || \ @@ -720,7 +720,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM // ldg() has an overload for __half_raw, but we also need one for Eigen::half. #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) + defined(EIGEN_HIPCC) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( __ldg(reinterpret_cast<const unsigned short*>(ptr))); diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index 1f6a562c5..dd4e77d3a 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -481,7 +481,7 @@ ptranspose(PacketBlock<double2,2>& kernel) { // Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning // its corresponding packet_traits<Eigen::half> must be visible on host. #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \ + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC)) || \ (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__)) typedef ulonglong2 Packet4h2; @@ -515,11 +515,13 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC) half2 r; r.x = from; r.y = from; return r; +#elif defined(EIGEN_HIPCC) + return __half2{from,from}; #else return __half2half2(from); #endif @@ -537,7 +539,7 @@ pset1<Packet4h2>(const Eigen::half& from) { return r; } -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) +#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -559,7 +561,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) +#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC) to[0] = from.x; to[1] = from.y; #else @@ -1056,7 +1058,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { #endif } // namespace - template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(const Eigen::half* from) { diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 271795a06..952abc306 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -439,9 +439,6 @@ #if defined(EIGEN_HIPCC) #define EIGEN_VECTORIZE_GPU #include <hip/hip_vector_types.h> -#endif - -#if defined(EIGEN_HIP_DEVICE_COMPILE) #define EIGEN_HAS_HIP_FP16 #include <hip/hip_fp16.h> #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index 9d3305cfd..36df03d62 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -306,11 +306,17 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { #if defined(EIGEN_HIPCC) - // FIXME : remove this workaround once we have native half/half2 support for __shfl_down - union { int i; half2 h; } wka_in, wka_out; - wka_in.h = accum; - wka_out.i = __shfl_down(wka_in.i, offset, warpSize); - reducer.reducePacket(wka_out.h, &accum); + PacketType r1; + half2* hr = reinterpret_cast<half2*>(&r1); + half2* hacc = reinterpret_cast<half2*>(&accum); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { int i; half2 h; } wka_in, wka_out; + wka_in.h = hacc[i]; + wka_out.i = __shfl_down(wka_in.i, offset, warpSize); + hr[i] = wka_out.h; + } + reducer.reducePacket(r1, &accum); #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 PacketType r1; half2* hr = reinterpret_cast<half2*>(&r1); @@ -661,16 +667,26 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { #if defined(EIGEN_HIPCC) - // FIXME : remove this workaround once we have native half/half2 support for __shfl_down - union { int i; half2 h; } wka_in, wka_out; - - wka_in.h = reduced_val1; - wka_out.i = __shfl_down(wka_in.i, offset, warpSize); - reducer.reducePacket(wka_out.h, &reduced_val1); - - wka_in.h = reduced_val2; - wka_out.i = __shfl_down(wka_in.i, offset, warpSize); - reducer.reducePacket(wka_out.h, &reduced_val2); + PacketType r1; + PacketType r2; + half2* hr1 = reinterpret_cast<half2*>(&r1); + half2* hr2 = reinterpret_cast<half2*>(&r2); + half2* rv1 = reinterpret_cast<half2*>(&reduced_val1); + half2* rv2 = reinterpret_cast<half2*>(&reduced_val2); + for (int i = 0; i < packet_width / 2; i++) { + // FIXME : remove this workaround once we have native half/half2 support for __shfl_down + union { int i; half2 h; } wka_in1, wka_out1; + wka_in1.h = rv1[i]; + wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize); + hr1[i] = wka_out1.h; + + union { int i; half2 h; } wka_in2, wka_out2; + wka_in2.h = rv2[i]; + wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize); + hr2[i] = wka_out2.h; + } + reducer.reducePacket(r1, &reduced_val1); + reducer.reducePacket(r2, &reduced_val2); #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 PacketType r1; PacketType r2; |