Fixing HIP breakage caused by the recent commit that introduces Packet4h2 as the Eigen::Half packet type

author: Deven Desai <deven.desai.amd@gmail.com> 2020-03-11 23:06:56 +0000
committer: Deven Desai <deven.desai.amd@gmail.com> 2020-03-12 01:06:24 +0000
commit: 7158ed4e0e34d40cd0f358a3bf69a5c30d8d0f83 (patch)
tree: 6ee1f2ce81b3e442210564b283fdf9e953ff0306
parent: d53ae40f7bcfb948b85b893acf305cdebcba3ba8 (diff)
4 files changed, 39 insertions, 25 deletions
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 9f4c1ebdf..cfd0bdc06 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -706,7 +706,7 @@ struct hash<Eigen::half> {
 
 // Add the missing shfl_xor intrinsic
 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+  defined(EIGEN_HIPCC)
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
   #if (EIGEN_CUDA_SDK_VER < 90000) || \
@@ -720,7 +720,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
 
 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
 #if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+  defined(EIGEN_HIPCC)
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(
       __ldg(reinterpret_cast<const unsigned short*>(ptr)));
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index 1f6a562c5..dd4e77d3a 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -481,7 +481,7 @@ ptranspose(PacketBlock<double2,2>& kernel) {
 // Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
 // its corresponding packet_traits<Eigen::half> must be visible on host.
 #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \
+  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC)) || \
   (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
 
 typedef ulonglong2 Packet4h2;
@@ -515,11 +515,13 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
 
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
   half2 r;
   r.x = from;
   r.y = from;
   return r;
+#elif defined(EIGEN_HIPCC)
+  return __half2{from,from};
 #else
   return __half2half2(from);
 #endif
@@ -537,7 +539,7 @@ pset1<Packet4h2>(const Eigen::half& from) {
   return r;
 }
 
-#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
 namespace {
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
@@ -559,7 +561,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
                                                    const half2& from) {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
   to[0] = from.x;
   to[1] = from.y;
 #else
@@ -1056,7 +1058,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
 #endif
 } // namespace
 
-
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
 pload<Packet4h2>(const Eigen::half* from) {
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 271795a06..952abc306 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -439,9 +439,6 @@
 #if defined(EIGEN_HIPCC)
   #define EIGEN_VECTORIZE_GPU
   #include <hip/hip_vector_types.h>
-#endif
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
   #define EIGEN_HAS_HIP_FP16
   #include <hip/hip_fp16.h>
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 9d3305cfd..36df03d62 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -306,11 +306,17 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
 #pragma unroll
   for (int offset = warpSize/2; offset > 0; offset /= 2) {
   #if defined(EIGEN_HIPCC)
-    // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
-    union { int i; half2 h; } wka_in, wka_out;
-    wka_in.h = accum;
-    wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
-    reducer.reducePacket(wka_out.h, &accum);
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+      union { int i; half2 h; } wka_in, wka_out;
+      wka_in.h = hacc[i];
+      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+      hr[i] = wka_out.h;
+    }
+    reducer.reducePacket(r1, &accum);
   #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
     PacketType r1;
     half2* hr = reinterpret_cast<half2*>(&r1);
@@ -661,16 +667,26 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 #pragma unroll
       for (int offset = warpSize/2; offset > 0; offset /= 2) {
       #if defined(EIGEN_HIPCC)
-        // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
-        union { int i; half2 h; } wka_in, wka_out;
-
-        wka_in.h = reduced_val1;
-        wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
-        reducer.reducePacket(wka_out.h, &reduced_val1);
-
-        wka_in.h = reduced_val2;
-        wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
-        reducer.reducePacket(wka_out.h, &reduced_val2);
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+	  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+	  union { int i; half2 h; } wka_in1, wka_out1;
+	  wka_in1.h = rv1[i];
+	  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
+	  hr1[i] = wka_out1.h;
+
+	  union { int i; half2 h; } wka_in2, wka_out2;
+	  wka_in2.h = rv2[i];
+	  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
+	  hr2[i] = wka_out2.h;
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
       #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
         PacketType r1;
         PacketType r2;
author	Deven Desai <deven.desai.amd@gmail.com>	2020-03-11 23:06:56 +0000
committer	Deven Desai <deven.desai.amd@gmail.com>	2020-03-12 01:06:24 +0000
commit	7158ed4e0e34d40cd0f358a3bf69a5c30d8d0f83 (patch)
tree	6ee1f2ce81b3e442210564b283fdf9e953ff0306
parent	d53ae40f7bcfb948b85b893acf305cdebcba3ba8 (diff)